def prepareImages(args, PE=-1):
    """Load the images named in ``args`` and prepare them for the FPGA.

    Args:
        args: Mapping read for the keys 'images', 'transform', 'img_mean',
            'in_shape', 'quantizecfg', 'scaleB' and 'firstfpgalayer'.
        PE: Forwarded unchanged to ``xdnn.prepareInputsForFpga``
            (defaults to -1).

    Returns:
        Tuple ``(fpgaInputs, shapes, batch_sz)``; ``batch_sz`` is
        ``len(args['images'])`` and ``shapes`` is the second value
        returned by ``loadImages``.
    """
    image_list = args['images']
    image_count = len(image_list)

    loaded, shapes = loadImages(
        image_list,
        args['transform'],
        args['img_mean'],
        args['in_shape'],
    )

    prepared = xdnn.prepareInputsForFpga(
        loaded,
        args['quantizecfg'],
        args['scaleB'],
        PE,
        args['firstfpgalayer'],
    )
    return prepared, shapes, image_count
def prepareImages(args, PE=-1):
    """Load the images named in ``args`` and prepare them for the FPGA.

    This variant also forwards the 'img_raw_scale' and 'img_input_scale'
    entries of ``args`` to ``loadImages``.

    Args:
        args: Mapping read for the keys 'images', 'transform',
            'img_raw_scale', 'img_mean', 'img_input_scale', 'in_shape',
            'quantizecfg', 'scaleB' and 'firstfpgalayer'.
        PE: Forwarded unchanged to ``xdnn.prepareInputsForFpga``
            (defaults to -1).

    Returns:
        Tuple ``(fpgaInputs, shapes, batch_sz)``; ``batch_sz`` is
        ``len(args['images'])`` and ``shapes`` is the second value
        returned by ``loadImages``.
    """
    image_list = args['images']
    image_count = len(image_list)

    loaded, shapes = loadImages(
        image_list,
        args['transform'],
        args['img_raw_scale'],
        args['img_mean'],
        args['img_input_scale'],
        args['in_shape'],
    )

    prepared = xdnn.prepareInputsForFpga(
        loaded,
        args['quantizecfg'],
        args['scaleB'],
        PE,
        args['firstfpgalayer'],
    )
    return prepared, shapes, image_count
def prepareRawInputs(args, PE=-1):
    """Load raw input data from ``args['datadir']`` and prepare it for the FPGA.

    Args:
        args: Mapping read for the keys 'batch_sz', 'datadir', 'in_shape',
            'quantizecfg', 'scaleB' and 'firstfpgalayer'.
        PE: Forwarded unchanged to ``xdnn.prepareInputsForFpga``
            (defaults to -1).

    Returns:
        Tuple ``(fpgaInputs, shapes, batch_sz)``; ``batch_sz`` is
        ``args['batch_sz']`` converted with ``int`` (or the literal 1 when
        the configured value already compares equal to 1).
    """
    # Anything that does not compare equal to 1 is coerced with int().
    batch_sz = int(args['batch_sz']) if args['batch_sz'] != 1 else 1

    raw, shapes = loadRawDataInput(args['datadir'], batch_sz, args['in_shape'])

    prepared = xdnn.prepareInputsForFpga(
        raw,
        args['quantizecfg'],
        args['scaleB'],
        PE,
        args['firstfpgalayer'],
    )
    return prepared, shapes, batch_sz
def prepareRawInputs(args, PE=-1):
    """Load raw input data from ``args['datadir']`` and prepare it for the FPGA.

    Prints a short banner before loading.

    Args:
        args: Mapping read for the keys 'batch_sz', 'datadir', 'in_shape',
            'quantizecfg', 'scaleB' and 'firstfpgalayer'.
        PE: Forwarded unchanged to ``xdnn.prepareInputsForFpga``
            (defaults to -1).

    Returns:
        Tuple ``(fpgaInputs, shapes, batch_sz)``; ``batch_sz`` is
        ``args['batch_sz']`` converted with ``int`` (or the literal 1 when
        the configured value already compares equal to 1).
    """
    print('import Raw Input')

    # Anything that does not compare equal to 1 is coerced with int().
    batch_sz = int(args['batch_sz']) if args['batch_sz'] != 1 else 1

    raw, shapes = loadRawDataInput(args['datadir'], batch_sz, args['in_shape'])

    prepared = xdnn.prepareInputsForFpga(
        raw,
        args['quantizecfg'],
        args['scaleB'],
        PE,
        args['firstfpgalayer'],
    )
    return prepared, shapes, batch_sz
def prepareFpgaInputs(inputs):
    """Prepare ``inputs`` for the FPGA using the module-level configuration.

    Args:
        inputs: Data handed to ``xdnn.prepareInputsForFpga`` as-is.

    Returns:
        Whatever ``xdnn.prepareInputsForFpga`` returns when called with the
        globals ``g_fpgaCfgFile``, ``g_scaleB`` and ``g_firstFpgaLayerName``
        and a PE argument of -1.
    """
    return xdnn.prepareInputsForFpga(
        inputs,
        g_fpgaCfgFile,
        g_scaleB,
        -1,
        g_firstFpgaLayerName,
    )
def img_classify(msg):
    """Run FPGA image classification for every file named in ``msg``.

    Args:
        msg: Request message whose ``rowset`` carries one column; that
            column's ``sdata`` holds the image file names and ``nrow`` the
            number of names.

    Returns:
        ``None`` when the request is empty or no classification was
        produced; otherwise an ``xdrive_pb2.XMsg`` whose rowset has four
        columns (filename, ordinal, score, class) filled from the tuples
        returned by ``get_classification``.
    """
    global g_inputs, g_inputbuf, g_fpgaOutput
    global g_weightsBlob, g_fcWeight, g_fcBias

    # The message is a rowset with one column: a list of file names.
    request = msg.rowset
    if len(request.columns) == 0 or request.columns[0].nrow == 0:
        print("Img classify request size is 0.\n")
        return None

    names = request.columns[0]
    print("Img classify request size is {0}.\n".format(names.nrow))

    ret = None

    # Lock the fpga device; the config is protected by this lock as well.
    # try/finally guarantees the lock is released even when loading or
    # inference raises, so one bad request cannot wedge later requests.
    fpga_lock.acquire()
    try:
        for i in range(names.nrow):
            fname = names.sdata[i]
            print("Running classification for images: {0}\n".format(fname))

            print("Prepare inputs ...\n")
            # g_batchSize = 1, for now.
            # Formatted print() emits the same text as the former Python 2
            # print statement while staying valid syntax on Python 3.
            print("g_inputs {0}".format(g_inputs))
            g_inputs[0] = xdnn_io.loadImageBlobFromFile(
                str(fname), g_mean, g_img_h, g_img_w)

            print("Quantize inputs ...\n")
            quantizeInputs = xdnn.quantizeInputs(
                g_firstFpgaLayerName, g_inputs, None, None,
                g_fpgaCfgFile, g_scaleB)

            print("Prepare inputs for fpga inputs ...\n")
            fpgaInputs = xdnn.prepareInputsForFpga(
                quantizeInputs, g_fpgaCfgFile, g_scaleB, -1,
                g_firstFpgaLayerName)

            print("Run FPGA commands ...\n")
            xdnn.execute(
                g_netFile, g_weightsBlob, fpgaInputs, g_fpgaOutput,
                g_batchSize, g_fpgaCfgFile, g_scaleB, g_PE)

            print("Compute FC ...\n")
            fcOutput = xdnn.computeFC(
                g_fcWeight, g_fcBias, g_fpgaOutput, g_batchSize,
                g_outputSize, g_fpgaOutputSize, g_useBlas)

            print("Softmax ...\n")
            softmaxOut = xdnn.computeSoftmax(fcOutput, g_batchSize)

            # NOTE(review): ret is overwritten on every iteration, so only
            # the last file's classification reaches the reply -- confirm
            # this is intended for multi-file requests.
            ret = get_classification(softmaxOut, fname)
    finally:
        fpga_lock.release()

    # Now construct the return message.
    if ret is None:
        print("Return None: ???\n")
        return None

    retmsg = xdrive_pb2.XMsg()
    reply = retmsg.rowset

    # Return 4 columns: (filename, ordinal, score, class).
    col1 = reply.columns.add()
    col2 = reply.columns.add()
    col3 = reply.columns.add()
    col4 = reply.columns.add()

    row_count = len(ret)
    col1.nrow = row_count
    col2.nrow = row_count
    col3.nrow = row_count
    col4.nrow = row_count

    for (a, b, c, d) in ret:
        col1.nullmap.append(False)
        col1.sdata.append(a)
        col2.nullmap.append(False)
        col2.i32data.append(b)
        col3.nullmap.append(False)
        col3.f64data.append(c)
        col4.nullmap.append(False)
        col4.sdata.append(d)

    return retmsg
def XDLFprepareRawInputs(args, RawInputs, PE=-1):
    """Prepare already-loaded raw inputs for the FPGA.

    Args:
        args: Mapping read for the keys 'quantizecfg', 'scaleB' and
            'firstfpgalayer'.
        RawInputs: Data handed to ``xdnn.prepareInputsForFpga`` as-is.
        PE: Forwarded unchanged to ``xdnn.prepareInputsForFpga``
            (defaults to -1).

    Returns:
        The value returned by ``xdnn.prepareInputsForFpga``.
    """
    return xdnn.prepareInputsForFpga(
        RawInputs,
        args['quantizecfg'],
        args['scaleB'],
        PE,
        args['firstfpgalayer'],
    )
def benchmark():
    """Time repeated forward passes on random input and print a summary.

    Reads its configuration from the first 'jsoncfg' entry returned by
    ``xdnn_io.processCommandLine()``, opens the FPGA handle, loads the
    weights, runs the network for ``args["iterations"]`` batches (two
    alternating streams in non-blocking mode) and prints timing figures.
    Exits the process with status 1 if ``xdnn.createHandle`` reports failure.
    The FPGA handle is closed on the way out, including when an error is
    raised after it was opened.

    NOTE(review): in non-blocking mode the code launches
    ``2 * (iterations // 2)`` batches (and always at least the two initial
    ones), while the summary divides by ``iterations``; for odd or very
    small iteration counts the reported figures are therefore approximate.
    """
    # Set to "Blocking" to time synchronous xdnn.execute calls instead.
    mode = "Non-Blocking"

    # Extract arguments from json
    args = xdnn_io.processCommandLine()["jsoncfg"][0]
    if "platform" in args:
        args["xclbin"] = ("../../overlaybins/" + str(args["platform"])
                          + "/" + args["xclbin"])

    # Establish communication w/ FPGA
    if xdnn.createHandle(args['xclbin'], libFile=args['xlnxlib']):
        sys.exit(1)

    try:
        # Transfer weights to device memory
        if "usexdnnv3" in args and args["usexdnnv3"] == "1":
            weightsBlob = xdnn_io.loadWeightsBiasQuantv3(args)
        else:
            weightsBlob = xdnn_io.loadWeightsBiasQuant(args)

        stream_ids = (0, 1)

        # Create random input data, one buffer per stream
        fpgaInputs = []
        for stream_id in stream_ids:
            random_input = np.float32(
                np.random.standard_normal(
                    (args["batchsz"], reduce(mul, args["in_shape"], 1))))
            quantized = xdnn.quantizeInputs(
                args["firstfpgalayer"], args["quantizecfg"],
                args["scaleB"], random_input)
            fpgaInputs.append(
                xdnn.prepareInputsForFpga(
                    quantized, args["quantizecfg"], args["scaleB"], -1,
                    args["firstfpgalayer"], stream_id))

        # Create buffers in host memory for results, one per stream
        fpgaOutputs = []
        for stream_id in stream_ids:
            fpgaOutputs.append(
                xdnn_io.prepareOutput(args['fpgaoutsz'], args["batchsz"]))

        # Load network schedule to accelerator
        for stream_id in stream_ids:
            xdnn.initScript(args['netcfg'], weightsBlob, args["batchsz"],
                            args['quantizecfg'], args['scaleB'],
                            args['PE'], stream_id)

        def push(stream_id):
            # Queue one asynchronous forward pass on the given stream.
            xdnn.exec_async(args['netcfg'], weightsBlob,
                            fpgaInputs[stream_id], fpgaOutputs[stream_id],
                            args["batchsz"], args['quantizecfg'],
                            args['scaleB'], args['PE'], stream_id)

        # Run forward propagation N times
        print("Running inference...\n")
        cumulative_time = -1 * timeit.default_timer()

        if mode == "Non-Blocking":
            push(0)
            push(1)
            # Floor division keeps the loop count an int on Python 3 as
            # well; on Python 2 it matches the former int "/" result.
            for _ in range(args["iterations"] // 2 - 1):
                xdnn.get_result(-1, 0)  # get 0
                push(0)                 # push 0
                xdnn.get_result(-1, 1)  # get 1
                push(1)                 # push 1
            xdnn.get_result(-1, 0)  # get 0
            xdnn.get_result(-1, 1)  # get 1
        else:
            for _ in range(args["iterations"]):
                xdnn.execute(args['netcfg'], weightsBlob, fpgaInputs[0],
                             fpgaOutputs[0], args["batchsz"],
                             args['quantizecfg'], args['scaleB'],
                             args['PE'])

        cumulative_time += timeit.default_timer()

        # Summarize
        print("===========================================")
        print("Performance Summary\n")
        print(" Network: %s" % (args["name"]))
        print(" Precision: %d" % (args["precision"]))
        print(" Images: %d" % (args["iterations"] * args["batchsz"]))
        print(" Batch Size: %d" % (args["batchsz"]))
        print(" Total Batches: %d" % (args["iterations"]))
        print(" Total Time: %.2f ms" % (1000 * cumulative_time))
        # Time per batch (labelled Single Image Latency in the output)
        print(" SIL: %.2f ms"
              % (1000 * cumulative_time / args["iterations"]))
        print(" FPS: %.2f"
              % (args["iterations"] * args["batchsz"] / cumulative_time))
        print(" GOPS: %.2f"
              % (args["ops"] * args["iterations"] * args["batchsz"]
                 / cumulative_time / 1000000000))
        print("===========================================\n")
    finally:
        # Release FPGA even if loading, inference or reporting failed.
        xdnn.closeHandle()