def fpga_waiter(fpgaRT, output_buffers, output_node_name, fcWeight, fcBias,
                free_job_id_queue, occupied_job_id_queue, response_queues):
    """Drain finished FPGA jobs forever and hand each result to its worker.

    Each iteration:
      1. takes a ``(job_id, worker_id, request_id)`` triple from
         ``occupied_job_id_queue``,
      2. waits on ``fpgaRT.get_result(job_id)``,
      3. runs ``xdnn.computeFC`` on ``output_buffers[job_id][output_node_name]``
         into a freshly allocated ``(batch, 1000)`` float32 array,
      4. puts ``(raw_bytes, request_id)`` on ``response_queues[worker_id]``,
      5. returns ``job_id`` to ``free_job_id_queue``.

    The loop never returns normally.  Any ``Exception`` is reported with a
    traceback and followed by ``sys.exit()``.
    """
    try:
        while True:
            slot, worker, request = occupied_job_id_queue.get()

            # Block until the FPGA reports this job as done.
            fpgaRT.get_result(slot)

            # Output of the node that feeds the fully connected layer.
            node_output = output_buffers[slot][output_node_name]

            # One row per batch entry; the FC width is fixed at 1000 here.
            batch = node_output.shape[0]
            fc_result = np.empty((batch, 1000), dtype=np.float32, order='C')
            xdnn.computeFC(fcWeight, fcBias, node_output, fc_result)

            # Reply to the requesting worker, then recycle the job slot.
            response_queues[worker].put((fc_result.tobytes(), request))
            free_job_id_queue.put(slot)
    except Exception:
        import sys
        import traceback
        traceback.print_exc()
        sys.exit()
def main():
    """Classify the images named on the command line, one batch at a time.

    Flow, as visible in this function:
      * parse arguments with ``xdnn_io.processCommandLine`` and open the FPGA
        handle (exits with status 1 if ``xdnn.createHandle`` reports failure),
      * for every slice of ``batch_sz`` image paths: load the images into the
        first FPGA input buffer, execute on the FPGA, run the fully connected
        layer and softmax through ``xdnn``, and print the classification,
      * when ``args['golden']`` is set, accumulate top-1 / top-5 hits against
        the golden map and print the average accuracy at the end.

    NOTE(review): the input buffer is reused across batches, so for a final
    partial batch the trailing rows still hold data from the previous batch;
    only the first ``len(pl)`` rows are scored below.
    """
    args = xdnn_io.processCommandLine()

    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0")
    if ret != 0:
        sys.exit(1)

    fpgaRT = xdnn.XDNNFPGAOp(handles, args)
    fpgaOutput = fpgaRT.getOutputs()
    fpgaInput = fpgaRT.getInputs()

    fcWeight, fcBias = xdnn_io.loadFCWeightsBias(args)
    img_paths = xdnn_io.getFilePaths(args['images'])
    batch_sz = args['batch_sz']
    fcOutput = np.empty((batch_sz, args['outsz']), dtype=np.float32, order='C')

    # Batch dimension followed by the per-sample dimensions of the first
    # input descriptor.
    # assumes inShape[2] / inShape[3] are image height / width -- TODO confirm
    inShape = (batch_sz,) + tuple(
        tuple(fpgaRT.getInputDescriptors().values())[0][1:])

    labels = xdnn_io.get_labels(args['labels'])
    goldenMap = None
    top1Count = 0
    top5Count = 0
    if args['golden']:
        goldenMap = xdnn_io.getGoldenMap(args['golden'])

    firstInput = list(fpgaInput.values())[0]
    firstOutput = list(fpgaOutput.values())[0]

    for i in range(0, len(img_paths), batch_sz):
        pl = []
        for j, p in enumerate(img_paths[i:i + batch_sz]):
            firstInput[j, ...], _ = xdnn_io.loadImageBlobFromFile(
                p, args['img_raw_scale'], args['img_mean'],
                args['img_input_scale'], inShape[2], inShape[3])
            pl.append(p)

        fpgaRT.execute(fpgaInput, fpgaOutput)
        xdnn.computeFC(fcWeight, fcBias, firstOutput, fcOutput)
        softmaxOut = xdnn.computeSoftmax(fcOutput)
        xdnn_io.printClassification(softmaxOut, pl, labels)

        if args['golden']:
            for j, p in enumerate(pl):
                top1Count += xdnn_io.isTopK(softmaxOut[j], goldenMap, p, labels, 1)
                top5Count += xdnn_io.isTopK(softmaxOut[j], goldenMap, p, labels, 5)

    xdnn.closeHandle()

    n_images = len(img_paths)
    # Skip the summary when nothing was processed to avoid dividing by zero.
    if args['golden'] and n_images > 0:
        # The format operator must be applied to the string, not to the
        # return value of print() (which is None on Python 3).
        print("\nAverage accuracy (n=%d) Top-1: %.1f%%, Top-5: %.1f%%\n" % (
            n_images,
            float(top1Count) / float(n_images) * 100.,
            float(top5Count) / float(n_images) * 100.))
def pop(self):
    """Collect the next finished stream and return it as a proto response.

    The stream slot is ``self.out_index % self.n_streams``.  After
    ``self.fpgaRT.get_result`` returns for that slot, the buffer entry
    ``"fc1000/Reshape_output"`` is run through ``xdnn.computeFC`` into a new
    ``(batch, 1000)`` float32 array, which is wrapped by
    ``request_wrapper.dictToProto`` under the same key.  ``self.out_index``
    is advanced only after the response has been built.
    """
    node = "fc1000/Reshape_output"

    # Wait for the finish signal of the slot that is next in line.
    slot = self.out_index % self.n_streams
    self.fpgaRT.get_result(slot)

    # Fully connected layer over that slot's FPGA output.
    fpga_result = self.output_buffers[slot][node]
    fc_result = np.empty((fpga_result.shape[0], 1000),
                         dtype=np.float32, order='C')
    xdnn.computeFC(self.fcWeight, self.fcBias, fpga_result, fc_result)

    proto = request_wrapper.dictToProto({node: fc_result})
    self.out_index += 1
    return proto
def run(self, imgList, fpgaOutput_list, fpgaOutputShape_list, shape_list):
    """Post-process one batch of FPGA output: FC layer, softmax, scoring.

    Only the first entries of ``fpgaOutput_list`` and
    ``fpgaOutputShape_list`` are read; ``shape_list`` is not used here.

    On the first call (``self.numProcessed == 0``) the method lazily sets up
    the start time, labels, optional ZMQ publisher, optional golden map with
    its top-1 / top-5 counters, and the reusable FC output buffer.

    Every call then adds ``len(imgList)`` to ``self.numProcessed``, runs
    ``xdnn.computeFC`` and ``xdnn.computeSoftmax``, updates the accuracy
    counters when ``args['golden']`` is set, and publishes the
    classification when a ZMQ publisher exists.  Returns ``None``.
    """
    fpga_result = fpgaOutput_list[0]
    # Not used below; the lookup is kept so an empty list still fails here.
    _first_shape = fpgaOutputShape_list[0]

    if self.numProcessed == 0:
        opts = self.args
        self.startTime = timeit.default_timer()
        self.labels = xdnn_io.get_labels(opts['labels'])

        self.zmqPub = None
        if opts['zmqpub']:
            self.zmqPub = ZmqResultPublisher(opts['deviceID'])

        self.goldenMap = None
        if opts['golden']:
            self.goldenMap = xdnn_io.getGoldenMap(opts['golden'])
            self.top5Count = 0
            self.top1Count = 0

        self.fcOutput = np.empty((opts['batch_sz'], opts['outsz']),
                                 dtype=np.float32, order='C')

    self.numProcessed += len(imgList)

    xdnn.computeFC(self.fcWeight, self.fcBias, fpga_result, self.fcOutput)
    probabilities = xdnn.computeSoftmax(self.fcOutput)

    if self.args['golden']:
        for idx, img_path in enumerate(imgList):
            scores = probabilities[idx]
            self.top1Count += xdnn_io.isTopK(
                scores, self.goldenMap, img_path, self.labels, 1)
            self.top5Count += xdnn_io.isTopK(
                scores, self.goldenMap, img_path, self.labels, 5)

    if self.zmqPub is None:
        return

    message = xdnn_io.getClassification(
        probabilities, imgList, self.labels, zmqPub=True)
    self.zmqPub.send(message)
def main(argv):
    """Run one batch through each network in ``args['jsoncfg']`` asynchronously.

    Steps, as visible in this function:
      1. parse ``argv`` and open the FPGA handle (exit status 1 on failure),
      2. for every network config: build an ``xdnn.XDNNFPGAOp``, record the
         input shape in the config under ``'in_shape'``, load the FC weights
         and bias, and allocate the FPGA output buffer,
      3. for every network config: load up to ``batch_sz`` images and submit
         them with ``exec_async`` on a stream id equal to the config's index,
      4. wait for every stream with ``get_result``,
      5. run the FC layer and softmax per network and print the
         classification, then close the handle.
    """
    args = xdnn_io.processCommandLine(argv)
    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0")
    # ret = xdnn.createHandle(g_xclbin, "kernelSxdnn_0", g_xdnnLib)
    if ret != 0:
        sys.exit(1)
    labels = xdnn_io.get_labels(args['labels'])

    # TODO dict of tuples instead?
    fpgaRT = {}
    fpgaOutputs = {}
    fcWeights = {}
    fcBiases = {}
    netFiles = {}  # populated below but not read again in this function
    confNames = []

    # Only the per-network configs are needed from here on.
    net_configs = args['jsoncfg']

    for netconf_args in net_configs:
        confName = str(netconf_args['name'])
        confNames.append(confName)
        # netconf_args['netcfg'] = './data/{}_{}.json'.format(netconf_args['net'], netconf_args['dsp'])
        fpgaRT[confName] = xdnn.XDNNFPGAOp(handles, netconf_args)

        # next(iter(...)) replaces the Python-2-only .itervalues().next()
        # and works on both Python 2 and Python 3.
        first_descriptor = next(
            iter(fpgaRT[confName].getInputDescriptors().values()))
        netconf_args['in_shape'] = tuple(
            (netconf_args['batch_sz'],) + tuple(first_descriptor[1:]))

        (fcWeights[confName],
         fcBiases[confName]) = xdnn_io.loadFCWeightsBias(netconf_args)
        fpgaOutputs[confName] = np.empty(
            (netconf_args['batch_sz'], int(netconf_args['fpgaoutsz'])),
            dtype=np.float32, order='C')
        netFiles[confName] = str(netconf_args['netcfg'])

    batchArrays = []
    for streamId, netconf_args in enumerate(net_configs):
        batchArrays.append(
            np.empty(netconf_args['in_shape'], dtype=np.float32, order='C'))
        img_paths = xdnn_io.getFilePaths(netconf_args['images'])
        for j, p in enumerate(img_paths[:netconf_args['batch_sz']]):
            batchArrays[-1][j, ...], _ = xdnn_io.loadImageBlobFromFile(
                p, netconf_args['img_raw_scale'], netconf_args['img_mean'],
                netconf_args['img_input_scale'],
                netconf_args['in_shape'][2], netconf_args['in_shape'][3])

        confName = str(netconf_args['name'])
        # Portable replacement for .iterkeys().next().
        firstInputName = next(iter(fpgaRT[confName].getInputs().keys()))
        firstOutputName = next(iter(fpgaRT[confName].getOutputs().keys()))
        fpgaRT[confName].exec_async(
            {firstInputName: batchArrays[-1]},
            {firstOutputName: fpgaOutputs[confName]},
            streamId)

    # Stream ids were assigned in config order, which is also confNames order.
    for streamId, confName in enumerate(confNames):
        fpgaRT[confName].get_result(streamId)

    for netconf_args in net_configs:
        confName = str(netconf_args['name'])
        fcOut = np.empty((netconf_args['batch_sz'], netconf_args['outsz']),
                         dtype=np.float32, order='C')
        xdnn.computeFC(fcWeights[confName], fcBiases[confName],
                       fpgaOutputs[confName], fcOut)

        softmaxOut = xdnn.computeSoftmax(fcOut)
        xdnn_io.printClassification(softmaxOut, netconf_args['images'], labels)

    xdnn.closeHandle()
# Step 3.6 # Execute the Fully Connected Layers on the CPU # The FPGA does not support fully connected layers # Given they are very fast with BLAS in the CPU, we leave the final layers to be executed there config["outsz"] = 102 # Number of elements output by FC layers config["useblas"] = True # Accelerate Fully Connected Layers in the CPU if len(pyxfdnn._xdnnManager._handles ) > 0: # Just make sure FPGA still available fcOut = pyxfdnn.computeFC( fcWeight, fcBias, fpgaOutputs, batch_sz, config['outsz'], config['fpgaoutsz'], config['useblas'] # Can use cblas if True or numpy if False ) # ### 11. Execute the Softmax layers # In[13]: # Compute the softmax to convert the output to a vector of probabilities softmaxOut = pyxfdnn.computeSoftmax(fcOut, batch_sz) # ### 12. Output the classification prediction scores # In[14]:
def img_classify(msg):
    """Classify the image files named in ``msg`` and return the result rowset.

    ``msg.rowset`` is expected to carry one column whose ``sdata`` holds file
    names.  Each file is loaded, quantized, executed on the FPGA, and passed
    through the FC layer and softmax, all while holding ``fpga_lock`` (which
    also protects ``config``).

    Returns:
        ``None`` when the request is empty or no classification was produced.
        Otherwise an ``xdrive_pb2.XMsg`` whose rowset has four columns filled
        from the tuples returned by ``get_classification``: string, int32,
        float64, string -- (filename, ordinal, score, class) according to the
        original comment.

    NOTE(review): ``ret`` is overwritten on every iteration, so for a request
    with several rows only the last file's classification is returned --
    confirm whether that is intended.
    """
    global g_cInputBuffer
    global g_cFpgaInputBuffer

    # message is a rowset, one col, a list of file names.
    rs = msg.rowset
    if len(rs.columns) == 0 or rs.columns[0].nrow == 0:
        print("Img classify request size is 0.\n")
        return None

    names = rs.columns[0]
    print("Img classify request size is {0}.\n".format(names.nrow))

    # Lock the fpga device. config is protected by this lock as well.
    # try/finally guarantees the lock is released even if a step raises;
    # otherwise every later request would block forever.
    fpga_lock.acquire()
    try:
        ret = None
        for i in range(names.nrow):
            fname = names.sdata[i]
            print("Running classification for images: {0}\n".format(fname))

            print("Prepare inputs ...\n")
            # g_batchSize = 1, for now.
            config["g_inputs"][0] = pyxfdnn_io.loadImageBlobFromFile(
                fname, config["img_mean"], g_imgh, g_imgw)

            print("Quantize inputs ...\n")
            quantizeInputs = pyxfdnn.quantizeInputs(
                config["firstfpgalayer"], config["g_inputs"],
                g_cInputBuffer, g_cFpgaInputBuffer,
                config["quantizecfg"], config["scaleB"])

            print("Prepare inputs for fpga inputs ...\n")
            fpgaInputs = pyxfdnn.prepareInputsForFpga(
                quantizeInputs, config["quantizecfg"], config["scaleB"],
                -1, config["firstfpgalayer"])

            print("Run FPGA commands ...\n")
            # The PE argument is deliberately not passed.  The Xilinx notebook
            # uses PE = 0, which works for a few images and then crashes; the
            # Xilinx example batch_classify.py says not to supply it (default
            # -1), and that runs fine for many images.
            pyxfdnn.execute(
                config["fpgacommands"], config["weightsBlob"], fpgaInputs,
                config["g_fpgaOutput"], g_batchSize,
                config["quantizecfg"], config["scaleB"]
                # , config["PE"]
            )

            print("Compute FC ...\n")
            fcOut = pyxfdnn.computeFC(
                config["fcWeight"], config["fcBias"], config["g_fpgaOutput"],
                g_batchSize, config["outsz"], config["fpgaoutsz"],
                config["useblas"])

            print("Softmax ...\n")
            softmaxOut = pyxfdnn.computeSoftmax(fcOut, g_batchSize)
            ret = get_classification(softmaxOut, fname, config)
    finally:
        fpga_lock.release()

    # Now construct return msg
    if ret is None:
        print("Return None: ???\n")
        return None

    retmsg = xdrive_pb2.XMsg()
    out = retmsg.rowset
    # return 4 columns, (filename, ordinal, score, class)
    col1 = out.columns.add()
    col2 = out.columns.add()
    col3 = out.columns.add()
    col4 = out.columns.add()
    for col in (col1, col2, col3, col4):
        col.nrow = len(ret)

    for (a, b, c, d) in ret:
        # print("Return {0}, {1}, {2}, {3}.\n".format(a, b, c, d))
        col1.nullmap.append(False)
        col1.sdata.append(a)
        col2.nullmap.append(False)
        col2.i32data.append(b)
        col3.nullmap.append(False)
        col3.f64data.append(c)
        col4.nullmap.append(False)
        col4.sdata.append(d)
    return retmsg