示例#1
0
def main():
  """Run the multi-process image classification pipeline.

  Parses the command line, allocates the shared input/output buffers and
  the queues (pre-filled with buffer indices), then starts three stages:
  a pool of pre-processing workers running ``run_prepImage``, a process
  running ``fpga_process_async`` and a process running ``post_process``.
  With ``args['perpetual']`` set, the image list is fed to the pool
  forever; otherwise it is submitted once and the function waits for all
  stages to finish.

  Exits the interpreter with status 1 if ``xdnn.createManager()`` does not
  return True.
  """
  # Number of shared input buffers; also the capacity of the queue that
  # carries their indices.
  numInputBuffers = 100

  parser = xdnn_io.default_parser_args()
  parser.add_argument('--numprepproc', type=int, default=1,
                      help='number of parallel processes used to decode and quantize images')
  parser.add_argument('--numstream', type=int, default=16,
                      help='number of FPGA streams')
  parser.add_argument('--deviceID', type=int, default=0,
                      help='FPGA no. -> FPGA ID to run in case multiple FPGAs')
  parser.add_argument('--benchmarkmode', type=int, default=0,
                      help='bypass pre/post processing for benchmarking')
  args = xdnn_io.make_dict_args(parser.parse_args())
  if xdnn.createManager() != True:
    sys.exit(1)

  compilerJSONObj = xdnn.CompilerJsonParser(args['netcfg'])
  qPrep = mp.Queue(maxsize=args['numprepproc'] * 10)
  qFpga = mp.Queue(maxsize=100)
  streamQ = mp.Queue(maxsize=args['numstream'])
  prepProcQ = mp.Queue(maxsize=numInputBuffers)

  # next(iter(...)) picks the first entry on both Python 2 and Python 3.
  firstOutputShape = next(iter(compilerJSONObj.getOutputs().values()))
  firstInputShape = next(iter(compilerJSONObj.getInputs().values()))

  # np.prod returns a NumPy scalar; mp.Array needs a plain int to treat the
  # argument as a size rather than as an initializer sequence.
  outputSize = int(args['batch_sz'] * np.prod(tuple(firstOutputShape[1:])))
  inputSize = int(np.prod(tuple(firstInputShape)))

  # One output buffer per stream; streamQ holds the indices of the buffers.
  fpgaOutputs = []
  for i in range(args['numstream']):
    fpgaOutputs.append(mp.Array(ctypes.c_float, outputSize))
    streamQ.put(i)

  # Input buffers; prepProcQ holds the indices of the buffers.
  sharedInputArrs = []
  for i in range(numInputBuffers):
    sharedInputArrs.append(mp.Array(ctypes.c_float, inputSize))
    prepProcQ.put(i)

  img_paths = xdnn_io.getFilePaths(args['images'])

  p = mp.Pool(initializer=init_prepImage,
              initargs=(args, qPrep, img_paths, sharedInputArrs, prepProcQ, compilerJSONObj,),
              processes=args['numprepproc'])

  xdnnProc = mp.Process(target=fpga_process_async,
                        args=(qPrep, qFpga, args, len(img_paths), sharedInputArrs,
                              prepProcQ, streamQ, fpgaOutputs, compilerJSONObj,))
  xdnnProc.start()

  postProc = mp.Process(target=post_process,
                        args=(qFpga, args, img_paths, streamQ, fpgaOutputs,))
  postProc.start()

  if args['perpetual']:
    # Resubmit the full image list each time the previous pass completes.
    # This loop never terminates on its own.
    while True:
      p.map_async(run_prepImage, range(len(img_paths))).wait()
  else:
    p.map_async(run_prepImage, range(len(img_paths)))

  xdnnProc.join()
  postProc.join()

  p.close()
  p.join()
示例#2
0
def init_fpga():
    """Initialise the xdnn runtime and the module-level inference buffers.

    Creates the xdnn manager and the FPGA handle, loads the FC weights/bias
    and the quantized weights blob, appends every stripped line of the label
    file to ``g_labelarray`` and allocates the input and output arrays.
    Results are stored in the module globals declared below.

    Raises:
        SystemExit: if the xdnn manager or the FPGA handle cannot be created.
    """
    global g_inputs
    global g_inputbuf
    global g_fpgaOutput
    global g_weightsBlob
    global g_fcWeight
    global g_fcBias
    print(" --- INIT FPGA --- \n")
    print("xclbin: {0}.\n".format(g_xclbin))
    print("xdnnLib: {0}.\n".format(g_xdnnLib))
    if xdnn.createManager(g_xdnnLib) != True:
        raise SystemExit("Error: xdnn createManager failed.")
    (g_fcWeight, g_fcBias) = xdnn_io.loadFCWeightsBias(g_xdnnTestDataDir)

    # A truthy return value from createHandle is treated as failure.
    if xdnn.createHandle(g_xclbin, "kernelSxdnn_0", g_xdnnLib, g_numDevices):
        raise SystemExit("ERROR: Unable to create handle to FPGA")
    print("INFO: Sucessfully create handle to FPGA.")

    # magics.   See ml-suite/notebooks tutorial.   Should we overwrite PE?
    args = {
        'datadir': g_xdnnTestDataDir,
        'quantizecfg': g_fpgaCfgFile,
        'scaleA': g_scaleA,
        'scaleB': g_scaleB,
        'PE': -1,
        'netcfg': g_netFile
    }

    print(" --- load weights --- \n")
    g_weightsBlob = xdnn_io.loadWeightsBiasQuant(args)

    print(" --- read lable file --- \n")
    with open(g_lableFile, 'r') as f:
        g_labelarray.extend(line.strip() for line in f)

    print(" --- prepare inputs --- \n")
    g_inputs = np.zeros((g_batchSize, g_img_c * g_img_h * g_img_w),
                        dtype=np.float32)
    g_inputbuf = np.zeros((g_batchSize, g_img_c, g_img_h, g_img_w),
                          dtype=np.float32)

    # Single-argument print() gives the same output as the old print
    # statement and parses on both Python 2 and Python 3.
    print("g_inputs %s" % (g_inputs,))

    print(" --- prepare outputs --- \n")
    # The second element of the returned pair is not used here.
    g_fpgaOutput, _fpgaHandle = xdnn.makeFPGAFloatArray(g_fpgaOutputSize *
                                                        g_batchSize)
示例#3
0
def prep_process(q):
    """Prepare image batches, quantize them and push them onto ``q``.

    Each item put on ``q`` is a ``(fpgaInputs, inputImageFiles)`` tuple.
    When ``prepareImages()`` returns ``None`` as its first element the loop
    stops and a final ``(None, None)`` tuple is queued as the end marker.

    Exits the process with status 1 if ``xdnn.createManager`` does not
    return True.
    """
    if xdnn.createManager(g_xdnnLib) != True:
        sys.exit(1)

    batch, batchFiles = prepareImages()
    while batch is not None:
        quantized = xdnn.quantizeInputs(g_firstFpgaLayerName, g_fpgaCfgFile,
                                        g_scaleB, batch)
        q.put((quantized, batchFiles))
        batch, batchFiles = prepareImages()

    # End-of-stream marker for the consumer of q.
    q.put((None, None))
示例#4
0
  def __init__(self, args, q, img_paths, sharedInputArrs, prepProcQ, compJson):
    """Set up per-worker state for image pre-processing.

    Creates the xdnn manager (exiting the process with status 1 when that
    does not return True), seeds NumPy's random generator, stores the
    arguments, queues, image paths and shared buffers that were passed in,
    derives a worker index from the current process identity, and builds a
    float32 array filled with ``args['img_mean']``.
    """
    if xdnn.createManager() != True:
      sys.exit(1)

    # Fixed seed for reproducibility.
    np.random.seed(123)

    self._args = args
    inShape = compJson.getInputs().itervalues().next()
    self._firstInputShape = inShape
    self._q = q
    self._imgpaths = img_paths

    # Worker index in [0, numprepproc), derived from the process identity.
    workerIdentity = mp.current_process()._identity[0]
    self._procid = (int(workerIdentity) - 1) % args['numprepproc']
    self._sharedmem = sharedInputArrs
    self._prepQ = prepProcQ

    # HWC format as this is the native format that comes out of jpeg decode
    hwcShape = (inShape[2], inShape[3], inShape[1],)
    meanArr = np.zeros(hwcShape, dtype=np.float32, order='C')
    meanArr += args['img_mean']
    self._meanarr = meanArr
示例#5
0
def prep_process(q, sharedInputArrs):
    """Fill the shared-memory banks with images and announce each one.

    Cycles through ``sharedInputArrs`` in order, viewing each bank as an
    int16 NumPy array and handing it to ``loadNpyImages`` (when
    ``g_ldPreProcImgsDir`` is set) or ``prepareImages``.  After each
    successful fill, ``putImages`` is called with the bank index and ``q``.
    When the loader returns ``None`` as its first element the loop stops,
    ``None`` is put on ``q`` and ``g_perfProf.syncToShared()`` is called.

    Exits the process with status 1 if ``xdnn.createManager`` does not
    return True.
    """
    if xdnn.createManager(g_xdnnLib) != True:
        sys.exit(1)

    bank = -1
    while True:
        bank = (bank + 1) % len(sharedInputArrs)
        # WARNING: shared mem below is not synchronized.
        # currently relies on shared mem banks to be consumed faster
        # than the next cycle of writes can come along.
        # Be sure to add enough shared mem banks to feed FPGA.
        bankView = np.frombuffer(sharedInputArrs[bank], np.int16)

        if g_ldPreProcImgsDir is None:
            fpgaInputs, _ = prepareImages(bankView)
        else:
            fpgaInputs, _ = loadNpyImages(bankView)

        if fpgaInputs is None:
            break
        putImages(bank, q)

    # End-of-stream marker for the consumer of q.
    q.put(None)
    g_perfProf.syncToShared()
示例#6
0
def main():
    """Classify images through a three-stage pipeline and report accuracy.

    Starts two child processes (``prep_process`` feeding ``qPrep`` and
    ``xdnn_process`` moving results from ``qPrep`` to ``qFpga``), then
    post-processes every ``(fpgaOutput, inputImageFiles)`` item read from
    ``qFpga``: the FC layer and softmax are computed, the classification is
    printed and, when a golden file is configured, top-1/top-5 counts are
    accumulated.  The loop ends when a ``(None, None)`` item is received.

    Exits the interpreter with status 1 if ``xdnn.createManager`` does not
    return True.
    """
    processCommandLine()
    ret = xdnn.createManager(g_xdnnLib)
    if ret != True:
        sys.exit(1)

    (fcWeight, fcBias) = xdnn_io.loadFCWeightsBias(g_xdnnTestDataDir)

    #
    # Spawn the first 2 stages of our pipeline
    # Stage 1: Process JPG
    # Stage 2: Run FPGA "classify"
    qPrep = Queue(maxsize=1)
    qFpga = Queue(maxsize=1)
    prepProc = Process(target=prep_process, args=(qPrep, ))
    xdnnProc = Process(target=xdnn_process, args=(qPrep, qFpga))
    prepProc.start()
    xdnnProc.start()

    #
    # The rest of this function post-processes FPGA output:
    # 1) Compute the final FC + Softmax layers
    # 2) Print classification & accuracy
    #
    zmqPub = ZmqResultPublisher() if g_zmqPub else None
    goldenMap = None
    if g_goldenFile:
        goldenMap = getGoldenMap(g_goldenFile, g_labelFile)
    numProcessed = 0
    allTop1 = 0
    allTop5 = 0
    while True:
        loopTime = timeit.default_timer()
        (fpgaOutput, inputImageFiles) = qFpga.get()

        # A (None, None) item marks the end of the stream.
        if fpgaOutput is None and inputImageFiles is None:
            break

        startTime = timeit.default_timer()
        fcOutput = xdnn.computeFC(fcWeight, fcBias, fpgaOutput, g_batchSize,
                                  g_outputSize, g_fpgaOutputSize, g_useBlas)
        elapsedTime = timeit.default_timer() - startTime
        print("[time] FC (%.2f ms)" % (elapsedTime * 1000))

        smaxOutput = xdnn.computeSoftmax(fcOutput, g_batchSize)

        numProcessed += g_batchSize

        (top1, top5) = printClassification(smaxOutput.flatten().tolist(),
                                           g_outputSize,
                                           inputImageFiles,
                                           g_labelFile,
                                           goldenMap,
                                           zmqPub=zmqPub)
        if goldenMap:
            # Floor division keeps the iteration counter an integer on
            # Python 3 as well.
            print("Accuracy (i=%d) Top-1: %d, Top-5: %d"
                  % (numProcessed // g_batchSize, top1, top5))
        allTop1 += top1
        allTop5 += top5

        print("Num processed: %d" % numProcessed)
        print("\n[time] Total loop (%.2f ms)" % (
            (timeit.default_timer() - loopTime) * 1000))

    if goldenMap and numProcessed:
        print("\nAverage accuracy (n=%d) Top-1: %.1f%%, Top-5: %.1f%%\n"
              % (numProcessed,
                 float(allTop1) / float(numProcessed) * 100.,
                 float(allTop5) / float(numProcessed) * 100.))

    prepProc.join()
    xdnnProc.join()
示例#7
0
def post_process():
    """Drive the pipeline and post-process FPGA output in this process.

    Allocates four shared int16 input banks, starts ``xdnn_process`` and,
    once it has sent a message on ``qMsgFromXdnn``, starts ``prep_process``.
    Each result obtained through ``getFpgaOutputs`` is run through the
    fully-connected layer (unless ``g_bypassFC``) and softmax, and accuracy
    is accumulated unless ``g_bypassLoad`` is set.  The loop stops when
    ``g_numImages`` images have been processed or a ``None`` output arrives.
    Afterwards both child processes are joined and a performance summary is
    printed.

    The global ``g_numProcessed`` is reset and updated by this function.
    Exits the interpreter with status 1 if ``xdnn.createManager`` does not
    return True.
    """
    global g_numProcessed
    processCommandLine()
    ret = xdnn.createManager(g_xdnnLib)
    if ret != True:
        sys.exit(1)

    loadImages()
    (fcWeight, fcBias) = xdnn_io.loadFCWeightsBias(g_xdnnTestDataDir)

    # sharedInputArrs = rolling bank of shared memory blocks
    # -- 1 bank for each stream
    sharedInputArrs = [
        sharedctypes.RawArray(ctypes.c_short,
                              g_fpgaBatchSize * g_paddedImageSize)
        for _ in range(4)
    ]

    # Spawn the first 2 stages of our pipeline
    # Stage 1: Process JPG
    # Stage 2: Run FPGA "classify"
    qFpga = Queue(maxsize=1)
    qPrep = Queue(maxsize=1)
    qMsgFromXdnn = Queue(maxsize=1)

    # start FPGA proc first to make sure FPGA is done initializing
    xdnnProc = Process(target=xdnn_process,
                       args=(qPrep, qFpga, qMsgFromXdnn, sharedInputArrs))
    xdnnProc.start()

    # only start prep proc after FPGA xdnn proc is ready
    # (the message content is not used; the blocking get() is the barrier)
    qMsgFromXdnn.get()
    prepProc = Process(target=prep_process, args=(qPrep, sharedInputArrs))
    prepProc.start()

    #
    # The rest of this function post-processes FPGA output:
    # 1) Compute the final FC + Softmax layers
    # 2) Print classification & accuracy
    #
    zmqPub = None
    if g_zmqPub:
        zmqPub = ZmqResultPublisher()
    goldenMap = None
    if g_goldenFile:
        goldenMap = getGoldenMap(g_goldenFile)
    g_numProcessed = 0
    allTop1 = 0
    allTop5 = 0

    startTime = None
    while True:
        fpgaOutput = getFpgaOutputs(qFpga)

        if g_numImages is not None and g_numProcessed >= g_numImages:
            break

        if fpgaOutput is None:
            break

        inputImageFiles = []
        for i in range(g_batchSize):
            idx = (g_numProcessed + i) % len(g_allInputImageFiles)
            inputImageFiles.append(g_allInputImageFiles[idx])

        if g_bypassFC:
            fcOutput = np.zeros(g_batchSize * g_outputSize)
        else:
            fcOutput = fullyConnected(fcWeight, fcBias, fpgaOutput,
                                      g_batchSize, g_outputSize,
                                      g_fpgaOutputSize, g_useBlas)
        smaxOutput = softmax(fcOutput, g_batchSize)
        g_numProcessed += g_batchSize

        if not g_bypassLoad:
            (top1, top5) = reportAccuracy(smaxOutput.flatten().tolist(),
                                          g_outputSize, inputImageFiles,
                                          g_labels, goldenMap, zmqPub, True)
            allTop1 += top1
            allTop5 += top5

        if startTime is None:
            # set startTime after skipping 1st iteration
            startTime = timeit.default_timer()

    endTime = timeit.default_timer()
    # startTime is still None when the loop ended before completing a single
    # iteration; report zero elapsed time instead of raising TypeError.
    elapsed = 0.0 if startTime is None else (endTime - startTime) * 1000  # ms

    prepProc.join()
    xdnnProc.join()

    g_perfProf.syncToShared()
    g_perfProf.printSummary()

    if g_numProcessed > 1:
        # The timer starts after the first loop iteration, i.e. after one
        # whole batch, so that batch is excluded from the timed image count.
        numProfiled = g_numProcessed - g_batchSize
        print("===========================================")
        print("Performance Summary\n")
        print("  Images: %d" % (g_numProcessed))
        if goldenMap is not None:
            print("  Top1: %.2f%%" % (100 * allTop1 / float(g_numProcessed)))
            print("  Top5: %.2f%%" % (100 * allTop5 / float(g_numProcessed)))
        print("  Batch Size: %d" % (g_batchSize))
        # Timing figures are only meaningful (and the divisions only safe)
        # when at least one batch was processed after the timer started.
        if numProfiled > 0 and elapsed > 0:
            print("  Total Batches: %d" % (numProfiled // g_batchSize))
            print("  Total Time: %.2f ms" % (elapsed))
            print("  Time/Batch: %.2f ms" % (g_batchSize * elapsed / numProfiled))
            print("  Time/Image: %.2f ms" % (elapsed / numProfiled))
            print("  Images/Second: %f" % (1000 * numProfiled / elapsed))
        print("===========================================\n")
def main():
    """Run the multi-process image classification pipeline.

    Parses the command line, allocates the shared input/output buffers and
    the queues (pre-filled with buffer indices), then starts three stages:
    a pool of pre-processing workers running ``run_prepImage``, a process
    running ``fpga_process_async`` and a process running ``post_process``.
    With ``args['perpetual']`` set, the image list is fed to the pool
    forever; otherwise it is submitted once and the function waits for all
    stages to finish.

    Exits the interpreter with status 1 if ``xdnn.createManager`` does not
    return True.
    """
    # Number of shared input buffers; also the capacity of the queue that
    # carries their indices.
    numInputBuffers = 100

    parser = xdnn_io.default_parser_args()
    parser.add_argument(
        '--numprepproc',
        type=int,
        default=1,
        help='number of parallel processes used to decode and quantize images')
    parser.add_argument('--numstream',
                        type=int,
                        default=16,
                        help='number of FPGA streams')
    parser.add_argument(
        '--deviceID',
        type=int,
        default=0,
        help='FPGA no. -> FPGA ID to run in case multiple FPGAs')
    args = xdnn_io.make_dict_args(parser.parse_args())
    if xdnn.createManager(args['xlnxlib']) != True:
        sys.exit(1)

    qPrep = mp.Queue(maxsize=args['numprepproc'] * 10)
    qFpga = mp.Queue(maxsize=100)
    streamQ = mp.Queue(maxsize=args['numstream'])
    prepProcQ = mp.Queue(maxsize=numInputBuffers)

    # Buffer sizes do not change between iterations, so compute them once.
    outputSize = args['batch_sz'] * args['fpgaoutsz']
    # np.prod returns a NumPy scalar; mp.Array needs a plain int to treat
    # the argument as a size rather than as an initializer sequence.
    inputSize = int(np.prod(args['in_shape']))

    # One output buffer per stream; streamQ holds the buffer indices.
    fpgaOutputs = []
    for i in range(args['numstream']):
        fpgaOutputs.append(mp.Array(ctypes.c_float, outputSize))
        streamQ.put(i)

    # Input buffers; prepProcQ holds the buffer indices.
    sharedInputArrs = []
    for i in range(numInputBuffers):
        sharedInputArrs.append(mp.Array(ctypes.c_float, inputSize))
        prepProcQ.put(i)

    img_paths = xdnn_io.getFilePaths(args['images'])

    p = mp.Pool(initializer=init_prepImage,
                initargs=(
                    args,
                    qPrep,
                    img_paths,
                    sharedInputArrs,
                    prepProcQ,
                ),
                processes=args['numprepproc'])

    xdnnProc = mp.Process(target=fpga_process_async,
                          args=(
                              qPrep,
                              qFpga,
                              args,
                              len(img_paths),
                              sharedInputArrs,
                              prepProcQ,
                              streamQ,
                              fpgaOutputs,
                          ))
    xdnnProc.start()

    postProc = mp.Process(target=post_process,
                          args=(
                              qFpga,
                              args,
                              img_paths,
                              streamQ,
                              fpgaOutputs,
                          ))
    postProc.start()

    if args['perpetual']:
        # Resubmit the full image list each time the previous pass
        # completes. This loop never terminates on its own.
        while True:
            p.map_async(run_prepImage, range(len(img_paths))).wait()
    else:
        p.map_async(run_prepImage, range(len(img_paths)))

    xdnnProc.join()
    postProc.join()

    p.close()
    p.join()