def post_process(qFrom, args, img_paths, streamQ, fpgaOutputs):
    """Run the CPU tail (FC + softmax) on batches that the FPGA has finished.

    Work items ``(sId, img_idx)`` are taken from ``qFrom`` until an item
    containing ``None`` is received.  For each item the buffer
    ``fpgaOutputs[sId]`` is pushed through the fully-connected layer and
    softmax, ``sId`` is put on ``streamQ``, and the result is optionally
    scored against the golden map and/or published over ZMQ.  Throughput
    (and accuracy when ``args['golden']`` is set) is printed at the end.

    Args:
        qFrom: queue yielding ``(sId, img_idx)``; either element being
            ``None`` ends the loop.
        args: option dictionary.  Keys read here: ``labels``, ``zmqpub``,
            ``deviceID``, ``golden``, ``batch_sz``, ``outsz`` and
            ``fpgaoutsz``.  It is also passed to
            ``xdnn_io.loadFCWeightsBias``.
        img_paths: sequence indexed by the non-negative entries of
            ``img_idx``.
        streamQ: queue that receives ``sId`` once the FPGA buffer has been
            consumed by the FC layer.
        fpgaOutputs: mapping from ``sId`` to an object exposing
            ``get_obj()`` whose buffer holds float32 data.
    """
    numProcessed = 0
    labels = xdnn_io.get_labels(args['labels'])
    zmqPub = None
    if args['zmqpub']:
        zmqPub = ZmqResultPublisher(args['deviceID'])

    # The counters are always defined so the reporting code below does not
    # depend on which branch ran.
    goldenMap = None
    top1Count = 0
    top5Count = 0
    if args['golden']:
        goldenMap = xdnn_io.getGoldenMap(args['golden'])

    (fcWeight, fcBias) = xdnn_io.loadFCWeightsBias(args)
    bsz = args['batch_sz']
    # Output buffer of the FC layer, reused for every batch.
    fcOutput = np.empty((bsz, args['outsz']), dtype=np.float32, order='C')

    start = timeit.default_timer()
    while True:
        (sId, img_idx) = qFrom.get()
        if numProcessed == 0:
            # The clock is restarted for every item received before any
            # image has been counted, so the wait for the first batch is
            # not part of the measured time.
            start = timeit.default_timer()
        if sId is None or img_idx is None:
            break

        # Negative indices are skipped (presumably padding of a partial
        # batch -- confirm against the producer).
        imgList = []
        for x in np.nditer(img_idx):
            if x >= 0:
                imgList.append(img_paths[x])
        numProcessed += len(imgList)

        npout_view = np.frombuffer(fpgaOutputs[sId].get_obj(),
                                   dtype=np.float32)
        xdnn.computeFC(fcWeight, fcBias, npout_view, bsz, args['outsz'],
                       args['fpgaoutsz'], fcOutput)
        # The FPGA buffer has been consumed; hand the stream id back before
        # doing the remaining CPU work.
        streamQ.put(sId)

        smaxOutput = xdnn.computeSoftmax(fcOutput)
        if args['golden']:
            for i, p in enumerate(imgList):
                top1Count += xdnn_io.isTopK(smaxOutput[i], goldenMap, p,
                                            labels, 1)
                top5Count += xdnn_io.isTopK(smaxOutput[i], goldenMap, p,
                                            labels, 5)

        if zmqPub is not None:
            predictMsg = xdnn_io.getClassification(smaxOutput,
                                                   imgList,
                                                   labels,
                                                   zmqPub=True)
            zmqPub.send(predictMsg)

    # Use the same clock that produced ``start``; mixing it with
    # time.time() gives a meaningless difference on platforms where the two
    # clocks have different epochs.
    elapsed = timeit.default_timer() - start
    rate = float(numProcessed) / elapsed if elapsed > 0 else 0.0
    print("%g images/s" % rate)

    # Skip the accuracy line when nothing was processed (avoids a division
    # by zero).  The formatting is done inside the call so the statement
    # behaves the same whether print is a statement or a function.
    if args['golden'] and numProcessed:
        print("\nAverage accuracy (n=%d) Top-1: %.1f%%, Top-5: %.1f%%\n"
              % (numProcessed,
                 float(top1Count) / float(numProcessed) * 100.,
                 float(top5Count) / float(numProcessed) * 100.))
예제 #2
0
def main(argv):
    """Run one batch through every network of the JSON config concurrently.

    For each entry of ``args['jsoncfg']`` an ``XDNNFPGAOp`` is created, the
    first ``batch_sz`` images are loaded, and the network is launched
    asynchronously on its own stream id.  After all streams have finished,
    the FC layer and softmax are computed on the CPU and the classification
    is printed.

    Args:
        argv: command line passed to ``xdnn_io.processCommandLine``.

    Exits with status 1 when the FPGA handle cannot be created.
    """
    args = xdnn_io.processCommandLine(argv)
    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0")
    if ret != 0:
        sys.exit(1)

    # From here on the handle is released even if a later step raises.
    try:
        labels = xdnn_io.get_labels(args['labels'])

        # Per-network state, keyed by the network's configured name.
        fpgaRT = {}
        fpgaOutputs = {}
        fcWeights = {}
        fcBiases = {}
        confNames = []

        # Only the per-network entries are needed below.
        netconfs = args['jsoncfg']
        for netconf_args in netconfs:
            confName = str(netconf_args['name'])
            confNames.append(confName)
            fpgaRT[confName] = xdnn.XDNNFPGAOp(handles, netconf_args)

            # in_shape = (batch,) + the first input descriptor without its
            # leading dimension.  next(iter(...)) works on Python 2 and 3.
            firstDescriptor = next(
                iter(fpgaRT[confName].getInputDescriptors().values()))
            netconf_args['in_shape'] = (
                (netconf_args['batch_sz'],) + tuple(firstDescriptor[1:]))

            (fcWeights[confName],
             fcBiases[confName]) = xdnn_io.loadFCWeightsBias(netconf_args)
            fpgaOutputs[confName] = np.empty(
                (netconf_args['batch_sz'], int(netconf_args['fpgaoutsz'])),
                dtype=np.float32, order='C')

        # One input buffer per stream; all of them stay referenced in this
        # list until the results have been collected.
        batchArrays = []
        for streamId, netconf_args in enumerate(netconfs):
            batch = np.empty(netconf_args['in_shape'],
                             dtype=np.float32, order='C')
            batchArrays.append(batch)

            img_paths = xdnn_io.getFilePaths(netconf_args['images'])
            for j, p in enumerate(img_paths[:netconf_args['batch_sz']]):
                batch[j, ...], _ = xdnn_io.loadImageBlobFromFile(
                    p,
                    netconf_args['img_raw_scale'],
                    netconf_args['img_mean'],
                    netconf_args['img_input_scale'],
                    netconf_args['in_shape'][2],
                    netconf_args['in_shape'][3])

            confName = str(netconf_args['name'])
            firstInputName = next(iter(fpgaRT[confName].getInputs().keys()))
            firstOutputName = next(
                iter(fpgaRT[confName].getOutputs().keys()))
            fpgaRT[confName].exec_async(
                {firstInputName: batch},
                {firstOutputName: fpgaOutputs[confName]},
                streamId)

        # Wait for every stream, in launch order.
        for streamId, confName in enumerate(confNames):
            fpgaRT[confName].get_result(streamId)

        for netconf_args in netconfs:
            confName = str(netconf_args['name'])
            fcOut = np.empty(
                (netconf_args['batch_sz'], netconf_args['outsz']),
                dtype=np.float32, order='C')
            xdnn.computeFC(fcWeights[confName], fcBiases[confName],
                           fpgaOutputs[confName], fcOut)

            softmaxOut = xdnn.computeSoftmax(fcOut)
            xdnn_io.printClassification(softmaxOut, netconf_args['images'],
                                        labels)
    finally:
        xdnn.closeHandle()
예제 #3
0
def main():
    """Classify every image in ``args['images']`` in batches on the FPGA.

    Each batch is loaded into a reusable input buffer, executed on the
    FPGA, passed through the FC layer and softmax on the CPU, and printed.
    When ``args['golden']`` is set, Top-1/Top-5 hits are counted and the
    average accuracy is printed at the end.

    Exits with status 1 when the FPGA handle cannot be created.
    """
    args = xdnn_io.processCommandLine()

    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0")
    if ret != 0:
        sys.exit(1)

    top1Count = 0
    top5Count = 0
    # From here on the handle is released even if a later step raises.
    try:
        fpgaRT = xdnn.XDNNFPGAOp(handles, args)
        fcWeight, fcBias = xdnn_io.loadFCWeightsBias(args)
        img_paths = xdnn_io.getFilePaths(args['images'])
        batch_sz = args['batch_sz']

        # Buffers are allocated once and reused for every batch.
        fpgaOutput = np.empty((batch_sz, args['fpgaoutsz']),
                              dtype=np.float32, order='C')
        fcOutput = np.empty((batch_sz, args['outsz']),
                            dtype=np.float32, order='C')
        batch_array = np.empty(((batch_sz, ) + args['in_shape']),
                               dtype=np.float32, order='C')

        labels = xdnn_io.get_labels(args['labels'])
        goldenMap = None
        if args['golden']:
            goldenMap = xdnn_io.getGoldenMap(args['golden'])

        for i in range(0, len(img_paths), batch_sz):
            # A short final batch leaves the trailing rows of batch_array
            # holding the previous batch; only the first len(pl) rows are
            # scored below.
            pl = list(img_paths[i:i + batch_sz])
            for j, p in enumerate(pl):
                batch_array[j, ...], _ = xdnn_io.loadImageBlobFromFile(
                    p, args['img_raw_scale'], args['img_mean'],
                    args['img_input_scale'], args['in_shape'][2],
                    args['in_shape'][1])

            fpgaRT.execute(batch_array, fpgaOutput)
            xdnn.computeFC(fcWeight, fcBias, fpgaOutput, batch_sz,
                           args['outsz'], args['fpgaoutsz'], fcOutput)
            softmaxOut = xdnn.computeSoftmax(fcOutput)
            xdnn_io.printClassification(softmaxOut, pl, labels)
            if args['golden']:
                for j, p in enumerate(pl):
                    top1Count += xdnn_io.isTopK(softmaxOut[j], goldenMap, p,
                                                labels, 1)
                    top5Count += xdnn_io.isTopK(softmaxOut[j], goldenMap, p,
                                                labels, 5)
    finally:
        xdnn.closeHandle()

    # Skip the summary for an empty image list (avoids a division by zero).
    # The formatting is done inside the call so the statement behaves the
    # same whether print is a statement or a function.
    numImages = len(img_paths)
    if args['golden'] and numImages:
        print("\nAverage accuracy (n=%d) Top-1: %.1f%%, Top-5: %.1f%%\n" % (
            numImages, float(top1Count) / float(numImages) * 100.,
            float(top5Count) / float(numImages) * 100.))
예제 #4
0
def main():
    """Single-network demo: FPGA execute, then FC and softmax, timing each.

    Prints the wall-clock time of each of the three stages in milliseconds,
    prints the classification, and closes the FPGA handle.  Exits with
    status 1 when the handle cannot be created.
    """
    def ms_since(t0):
        # Milliseconds elapsed since the timestamp ``t0``.
        return (timeit.default_timer() - t0) * 1000

    opts = xdnn_io.processCommandLine()
    status = xdnn.createHandle(opts['xclbin'], "kernelSxdnn_0",
                               opts['xlnxlib'])
    if status != 0:
        sys.exit(1)

    weightsBlob, fcWeight, fcBias = xdnn_io.loadWeights(opts)
    fpgaInputs, batch_sz = xdnn_io.prepareInput(opts)
    fpgaOutput = xdnn_io.prepareOutput(opts['fpgaoutsz'], batch_sz)

    # Stage 1: the part of the network that runs on the FPGA.
    tick = timeit.default_timer()
    xdnn.execute(opts['netcfg'], weightsBlob, fpgaInputs, fpgaOutput,
                 batch_sz, opts['quantizecfg'], opts['scaleB'], opts['PE'])
    print("\nAfter FPGA (%f ms)" % ms_since(tick))

    # Stage 2: fully-connected layer on the CPU.
    tick = timeit.default_timer()
    fcOut = xdnn.computeFC(fcWeight, fcBias, fpgaOutput, batch_sz,
                           opts['outsz'], opts['fpgaoutsz'], opts['useblas'])
    print("\nAfter FC (%f ms)" % ms_since(tick))

    # Stage 3: softmax on the CPU.
    tick = timeit.default_timer()
    softmaxOut = xdnn.computeSoftmax(fcOut, batch_sz)
    print("\nAfter Softmax (%f ms)" % ms_since(tick))

    xdnn_io.printClassification(softmaxOut, opts)

    print("\nSuccess!\n")
    xdnn.closeHandle()
예제 #5
0
    def exec_post_fpga(self, image, streamId):
        """Finish classification of one image on the CPU.

        Applies the FC layer and softmax to the FPGA output stored for
        ``streamId`` and formats the result with
        ``xdnn_io.getClassification``.

        Args:
            image: identifier of the image, forwarded (as a one-element
                list) to ``xdnn_io.getClassification``.
            streamId: key into ``self._streamOutputs`` selecting the FPGA
                output buffer to post-process.

        Returns:
            list of str: the lines of the classification text, with the
            lines containing ``"-------"`` removed.
        """
        # Only two scalars are read from the configuration, so no copy of
        # it is needed.
        outsz = self._config['outsz']
        fpgaoutsz = self._config['fpgaoutsz']
        fpgaOutput = self._streamOutputs[streamId]
        batch_sz = 1

        # NumPy only accepts 'C' or 'F' as the order of a new array;
        # longer spellings such as 'C_CONTIGUOUS' are rejected by current
        # releases.
        fcOut = np.empty((batch_sz, outsz), dtype=np.float32, order='C')
        xdnn.computeFC(self._fcWeight, self._fcBias, fpgaOutput, batch_sz,
                       outsz, fpgaoutsz, fcOut)

        softmaxOut = xdnn.computeSoftmax(fcOut)

        result = xdnn_io.getClassification(softmaxOut, [image], self._labels)
        return [x for x in result.strip().split("\n") if "-------" not in x]
예제 #6
0
def main():
    """Run every network of ``args['jsoncfg']`` concurrently and classify.

    Stages, each followed by a timing line in milliseconds: create the FPGA
    handle, load weights and inputs for every network, launch all networks
    asynchronously, wait for all of them, then compute FC and softmax on
    the CPU per network and print the classification.

    Exits with status 1 when the FPGA handle cannot be created.
    """
    args = xdnn_io.processCommandLine()

    startTime = timeit.default_timer()
    ret = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0", args['xlnxlib'])
    if ret != 0:
        sys.exit(1)
    elapsedTime = timeit.default_timer() - startTime
    print("\nAfter createHandle (%f ms):" % (elapsedTime * 1000))

    # From here on the handle is released even if a later step raises.
    try:
        startTime = timeit.default_timer()

        # Per-network state, keyed by the network's configured name.
        fpgaInputs = {}
        fpgaOutputs = {}
        weightsBlobs = {}
        fcWeights = {}
        fcBiases = {}
        batch_sizes = {}
        PEs = {}
        netFiles = {}
        confNames = []

        for netconf_args in args['jsoncfg']:
            confName = str(netconf_args['name'])
            confNames.append(confName)
            # 'PE' is a whitespace-separated string of integers.
            PE = [int(x) for x in netconf_args['PE'].split()]

            PEs[confName] = PE
            (weightsBlobs[confName], fcWeights[confName],
             fcBiases[confName]) = xdnn_io.loadWeights(netconf_args)
            (fpgaInputs[confName],
             batch_sz) = xdnn_io.prepareInput(netconf_args, PE)
            batch_sizes[confName] = batch_sz
            fpgaOutputs[confName] = xdnn_io.prepareOutput(
                int(netconf_args['fpgaoutsz']), batch_sz)
            netFiles[confName] = str(netconf_args['netcfg'])

        elapsedTime = timeit.default_timer() - startTime
        print("\nAfter init (%f ms):" % (elapsedTime * 1000))
        startTime = timeit.default_timer()

        for netconf_args in args['jsoncfg']:
            confName = str(netconf_args['name'])
            xdnn.exec_async(netFiles[confName], weightsBlobs[confName],
                            fpgaInputs[confName], fpgaOutputs[confName],
                            int(batch_sizes[confName]),
                            netconf_args['quantizecfg'],
                            netconf_args['scaleB'], PEs[confName])

        elapsedTime = timeit.default_timer() - startTime
        print("\nAfter Execonly (%f ms):" % (elapsedTime * 1000))
        startTime = timeit.default_timer()

        for confName in confNames:
            xdnn.get_result(PEs[confName])

        elapsedTime = timeit.default_timer() - startTime
        print("\nAfter wait (%f ms):" % (elapsedTime * 1000))

        for netconf_args in args['jsoncfg']:
            confName = str(netconf_args['name'])

            # Restart the clock for every network; otherwise the FC time of
            # the second and later networks would include the previous
            # network's softmax and printing.
            startTime = timeit.default_timer()
            fcOut = xdnn.computeFC(fcWeights[confName], fcBiases[confName],
                                   fpgaOutputs[confName],
                                   batch_sizes[confName],
                                   netconf_args['outsz'],
                                   netconf_args['fpgaoutsz'],
                                   netconf_args['useblas'])
            elapsedTime = timeit.default_timer() - startTime
            print("\nAfter FC (%f ms):" % (elapsedTime * 1000))

            startTime = timeit.default_timer()
            softmaxOut = xdnn.computeSoftmax(fcOut, batch_sizes[confName])
            elapsedTime = timeit.default_timer() - startTime
            print("\nAfter Softmax (%f ms):" % (elapsedTime * 1000))

            xdnn_io.printClassification(softmaxOut, netconf_args)

        print("\nSuccess!\n")
    finally:
        xdnn.closeHandle()
예제 #7
0
def main():
    """Drive the three-stage classification pipeline and report accuracy.

    Two worker processes are started (``prep_process`` feeding ``qPrep``
    and ``xdnn_process`` moving work from ``qPrep`` to ``qFpga``).  This
    function is the last stage: it takes ``(fpgaOutput, inputImageFiles)``
    items from ``qFpga``, computes FC and softmax, prints the
    classification, and accumulates Top-1/Top-5 counts until a
    ``(None, None)`` item arrives.  Configuration comes from module-level
    ``g_*`` globals set by ``processCommandLine``.

    Exits with status 1 when ``xdnn.createManager`` does not return True.
    """
    processCommandLine()
    ret = xdnn.createManager(g_xdnnLib)
    # Kept as an equality test against True to preserve the original
    # accept/reject behaviour for non-bool return values.
    if ret != True:  # noqa: E712
        sys.exit(1)

    (fcWeight, fcBias) = xdnn_io.loadFCWeightsBias(g_xdnnTestDataDir)

    #
    # Spawn the first 2 stages of our pipeline
    # Stage 1: Process JPG
    # Stage 2: Run FPGA "classify"
    qPrep = Queue(maxsize=1)
    qFpga = Queue(maxsize=1)
    prepProc = Process(target=prep_process, args=(qPrep, ))
    xdnnProc = Process(target=xdnn_process, args=(qPrep, qFpga))
    prepProc.start()
    xdnnProc.start()

    #
    # The rest of this function post-processes FPGA output:
    # 1) Compute the final FC + Softmax layers
    # 2) Print classification & accuracy
    #
    zmqPub = None
    if g_zmqPub:
        zmqPub = ZmqResultPublisher()
    goldenMap = None
    if g_goldenFile:
        goldenMap = getGoldenMap(g_goldenFile, g_labelFile)
    numProcessed = 0
    allTop1 = 0
    allTop5 = 0
    while True:
        loopTime = timeit.default_timer()
        (fpgaOutput, inputImageFiles) = qFpga.get()

        # (None, None) marks the end of the stream.
        if fpgaOutput is None and inputImageFiles is None:
            break

        startTime = timeit.default_timer()
        fcOutput = xdnn.computeFC(fcWeight, fcBias, fpgaOutput, g_batchSize,
                                  g_outputSize, g_fpgaOutputSize, g_useBlas)
        elapsedTime = timeit.default_timer() - startTime
        print("[time] FC (%.2f ms)" % (elapsedTime * 1000))

        smaxOutput = xdnn.computeSoftmax(fcOutput, g_batchSize)

        # Every item is counted as a full batch.
        numProcessed += g_batchSize

        (top1, top5) = printClassification(smaxOutput.flatten().tolist(),
                                           g_outputSize,
                                           inputImageFiles,
                                           g_labelFile,
                                           goldenMap,
                                           zmqPub=zmqPub)
        if goldenMap:
            # Floor division keeps the batch counter an integer regardless
            # of the interpreter's division semantics.
            print("Accuracy (i=%d) Top-1: %d, Top-5: %d"
                  % (numProcessed // g_batchSize, top1, top5))
        allTop1 += top1
        allTop5 += top5

        print("Num processed: %d" % numProcessed)
        print("\n[time] Total loop (%.2f ms)" % (
            (timeit.default_timer() - loopTime) * 1000))

    if goldenMap and numProcessed:
        print("\nAverage accuracy (n=%d) Top-1: %.1f%%, Top-5: %.1f%%\n"
              % (numProcessed,
                 float(allTop1) / float(numProcessed) * 100.,
                 float(allTop5) / float(numProcessed) * 100.))

    prepProc.join()
    xdnnProc.join()
def executeOnFPGA(sProtoBufPath, Qmode, Inference_Data, handle, name,
                  num_models):
    """Benchmark ``num_models`` copies of a model running in parallel.

    A random batch is generated once and submitted asynchronously to every
    model instance; the FC layer and softmax are then computed per model.
    The measured throughput is printed, appended to ``Inference_Data`` and
    merged into ``multinet_results.csv``.

    Args:
        sProtoBufPath: model path forwarded to ``initializeFpgaModel``.
        Qmode: quantization mode forwarded to ``initializeFpgaModel``; also
            used in the experiment label.
        Inference_Data: list of result dicts; one dict is appended in place.
        handle: ignored -- a fresh handle is always created here.
        name: unused; kept for interface compatibility.
        num_models: number of model instances to run in parallel.

    Raises:
        RuntimeError: if the FPGA handle cannot be created.
    """
    TOTAL_IMAGES = 128

    # Create handle for FPGA
    ret, handle = xdnn.createHandle(
        "../overlaybins/" + "aws" + "/overlay_1.xclbin", "kernelSxdnn_0")
    if ret != 0:
        raise RuntimeError("xdnn.createHandle failed (status %s)" % (ret,))

    try:
        # Per-model state, keyed by the model index as a string.
        fpgaRT = {}
        fpgaOutput = {}
        fcWeight = {}
        fcBias = {}
        fcOutputs = {}
        configs = {}

        # The same random batch is fed to every model.
        batch_array = generateRandomBatch(TOTAL_IMAGES, None)

        for i in range(num_models):
            key = str(i)
            config = initializeFpgaModel(sProtoBufPath, Qmode)
            config["PE"] = i
            config["name"] = config["name"] + "_" + key
            # Load weights to FPGA
            config = TransferWeightsFPGA(len(batch_array), config, handle, i)
            fpgaRT[key] = xdnn.XDNNFPGAOp(handle, config)
            (fcWeight[key], fcBias[key]) = xdnn_io.loadFCWeightsBias(config)
            (fpgaOutput[key], fcOutputs[key],
             configs[key]) = AllocateMemoryToHost(config)

        start0 = time.time()
        # Schedule FPGA execution asynchronously
        for i in range(num_models):
            fpgaRT[str(i)].exec_async(batch_array, fpgaOutput[str(i)], i)

        # Fetch results of all parallel executions
        for i in range(num_models):
            key = str(i)
            fpgaRT[key].get_result(i)
            # Each model uses its own configuration and FC buffer rather
            # than whatever the last loop iteration left behind.
            xdnn.computeFC(fcWeight[key], fcBias[key], fpgaOutput[key],
                           configs[key]['batch_sz'], configs[key]['outsz'],
                           configs[key]['fpgaoutsz'], fcOutputs[key])
            # The result is discarded; the call is part of the timed work.
            xdnn.computeSoftmax(fcOutputs[key])

        end = time.time()
    finally:
        xdnn.closeHandle()

    duration = end - start0
    throughput = num_models * len(batch_array) / duration
    print("throughput", throughput, "duration", duration)

    Inference_Data.append({
        "experiment": str(Qmode) + "_bit_mode",
        "duration_overall": duration,
        "imgsPerSecAll": throughput,
        "num_models_parallel": num_models,
    })

    newRows = pd.DataFrame(Inference_Data)
    try:
        previous = pd.read_csv('multinet_results.csv')
    except IOError:
        # No results file yet: start one from this run's data.
        result = newRows
    else:
        # DataFrame.append was removed in pandas 2.0; concat is the
        # equivalent row-wise merge.
        result = pd.concat([previous, newRows])
    result.to_csv('multinet_results.csv')
예제 #9
0
def img_classify(msg):
    """Classify every file named in ``msg`` and return the results as a rowset.

    ``msg.rowset`` is expected to hold one column whose ``sdata`` lists
    file names.  Each file is loaded, quantized, run on the FPGA and passed
    through FC and softmax while ``fpga_lock`` is held.  The rows produced
    by ``get_classification`` for all files are returned in a new
    ``XMsg`` with four columns: (filename, ordinal, score, class).

    Returns:
        ``xdrive_pb2.XMsg`` with the result rowset, or ``None`` when the
        request is empty or no classification was produced.
    """
    # message is a rowset, one col, a list of file names.
    rs = msg.rowset
    if len(rs.columns) == 0 or rs.columns[0].nrow == 0:
        print("Img classify request size is 0.\n")
        return None
    print("Img classify request size is {0}.\n".format(rs.columns[0].nrow))

    ret = None
    # Lock the fpga device.   config is protected by this lock as well.
    fpga_lock.acquire()
    try:
        for i in range(rs.columns[0].nrow):
            fname = rs.columns[0].sdata[i]
            print("Running classification for images: {0}\n".format(fname))
            print("Prepare inputs ...\n")

            # g_batchSize = 1, for now.
            print("g_inputs %s" % (g_inputs,))
            g_inputs[0] = xdnn_io.loadImageBlobFromFile(str(fname), g_mean,
                                                        g_img_h, g_img_w)

            print("Quantize inputs ...\n")
            quantizeInputs = xdnn.quantizeInputs(g_firstFpgaLayerName,
                                                 g_inputs, None, None,
                                                 g_fpgaCfgFile, g_scaleB)

            print("Prepare inputs for fpga inputs ...\n")
            fpgaInputs = xdnn.prepareInputsForFpga(quantizeInputs,
                                                   g_fpgaCfgFile, g_scaleB,
                                                   -1, g_firstFpgaLayerName)

            print("Run FPGA commands ...\n")
            xdnn.execute(g_netFile, g_weightsBlob, fpgaInputs, g_fpgaOutput,
                         g_batchSize, g_fpgaCfgFile, g_scaleB, g_PE)

            print("Compute FC ...\n")
            fcOutput = xdnn.computeFC(g_fcWeight, g_fcBias, g_fpgaOutput,
                                      g_batchSize, g_outputSize,
                                      g_fpgaOutputSize, g_useBlas)

            print("Softmax ...\n")
            softmaxOut = xdnn.computeSoftmax(fcOutput, g_batchSize)

            # Collect the rows of every file; previously each iteration
            # overwrote the result so only the last file was returned.
            rows = get_classification(softmaxOut, fname)
            if rows is not None:
                if ret is None:
                    ret = []
                ret.extend(rows)
    finally:
        # Release the device even when a step above raises.
        fpga_lock.release()

    # Now construct return msg
    if ret is None:
        print("Return None: ???\n")
        return None

    retmsg = xdrive_pb2.XMsg()
    rs = retmsg.rowset
    # return 4 columns, (filename, ordinal, score, class)
    col1 = rs.columns.add()
    col2 = rs.columns.add()
    col3 = rs.columns.add()
    col4 = rs.columns.add()
    for col in (col1, col2, col3, col4):
        col.nrow = len(ret)

    for (a, b, c, d) in ret:
        col1.nullmap.append(False)
        col1.sdata.append(a)
        col2.nullmap.append(False)
        col2.i32data.append(b)
        col3.nullmap.append(False)
        col3.f64data.append(c)
        col4.nullmap.append(False)
        col4.sdata.append(d)

    return retmsg
예제 #10
0
def img_classify(msg):
    """Classify the files named in ``msg`` in batches and return a rowset.

    ``msg.rowset`` is expected to hold one column whose ``sdata`` lists
    file names.  Files are processed ``g_args['batch_sz']`` at a time while
    ``fpga_lock`` is held; the buffers and runtime objects come from
    ``g_ctxt`` and differ between deploy mode and non-deploy mode.  The
    rows produced by ``get_classification`` are returned in a new ``XMsg``
    with four columns: (filename, ordinal, score, class).

    Returns:
        ``xdrive_pb2.XMsg`` with the result rowset, or ``None`` when the
        request is empty.
    """
    # message is a rowset, one col, a list of file names.
    rs = msg.rowset
    if len(rs.columns) == 0 or rs.columns[0].nrow == 0:
        print("Img classify request size is 0.\n")
        return None
    nrow = rs.columns[0].nrow
    print("Img classify request size is {0}.\n".format(nrow))

    ret = []
    # Lock the fpga device.   config is protected by this lock as well.
    fpga_lock.acquire()
    try:
        batch_sz = g_args['batch_sz']
        deploy = is_deploymode()
        if deploy:
            # next(iter(...)) works on both Python 2 and Python 3 dicts.
            firstInput = next(iter(g_ctxt['fpgaInput'].values()))
            firstOutput = next(iter(g_ctxt['fpgaOutput'].values()))

        for i in range(0, nrow, batch_sz):
            pl = []
            # The last batch may be short; never index past the request.
            # Rows of the input buffer beyond len(pl) keep their previous
            # contents.
            for j in range(min(batch_sz, nrow - i)):
                fname = str(rs.columns[0].sdata[i + j])
                print("Running classification for {0}-th images: {1}\n"
                      .format(i + j, fname))
                if deploy:
                    firstInput[j, ...], _ = xdnn_io.loadImageBlobFromFile(
                        fname, g_args['img_raw_scale'], g_args['img_mean'],
                        g_args['img_input_scale'], g_ctxt['inShape'][2],
                        g_ctxt['inShape'][3])
                else:
                    g_ctxt['batch_array'][j, ...], _ = \
                        xdnn_io.loadImageBlobFromFile(
                            fname, g_args['img_raw_scale'],
                            g_args['img_mean'], g_args['img_input_scale'],
                            g_ctxt['in_shape'][2], g_ctxt['in_shape'][1])
                pl.append(fname)

            if deploy:
                g_ctxt['fpgaRT'].execute(g_ctxt['fpgaInput'],
                                         g_ctxt['fpgaOutput'])
                xdnn.computeFC(g_ctxt['fcWeight'], g_ctxt['fcBias'],
                               firstOutput, g_ctxt['fcOutput'])
            else:
                g_ctxt['fpgaRT'].execute(g_ctxt['batch_array'],
                                         g_ctxt['fpgaOutput'])
                xdnn.computeFC(g_ctxt['fcWeight'], g_ctxt['fcBias'],
                               g_ctxt['fpgaOutput'], batch_sz,
                               g_args['outsz'], g_args['fpgaoutsz'],
                               g_ctxt['fcOutput'])

            softmaxOut = xdnn.computeSoftmax(g_ctxt['fcOutput'])
            ret.extend(get_classification(softmaxOut, pl, g_ctxt['labels']))
    finally:
        # Release the device even when a step above raises.
        fpga_lock.release()

    retmsg = xdrive_pb2.XMsg()
    rs = retmsg.rowset
    # return 4 columns, (filename, ordinal, score, class)
    col1 = rs.columns.add()
    col2 = rs.columns.add()
    col3 = rs.columns.add()
    col4 = rs.columns.add()
    for col in (col1, col2, col3, col4):
        col.nrow = len(ret)

    for row in ret:
        col1.nullmap.append(False)
        col1.sdata.append(row[0])
        col2.nullmap.append(False)
        col2.i32data.append(row[1])
        col3.nullmap.append(False)
        col3.f64data.append(row[2])
        col4.nullmap.append(False)
        col4.sdata.append(row[3])

    return retmsg
def _report_elapsed_ms(label, startTime):
    """Print ``label`` with the milliseconds elapsed since ``startTime``."""
    elapsedTime = timeit.default_timer() - startTime
    print("\n%s (%f ms):" % (label, elapsedTime * 1000))


def _run_net_on_fpga(netCfg, execLabel, fetchLabel):
    """Transfer the inputs of ``netCfg``, launch it and wait for the result.

    Fills ``netCfg['fpgaInputs']``, ``netCfg['batch_sz']`` and
    ``netCfg['shapes']`` from ``xdnn_io.prepareInput`` and prints one
    timing line per step; ``execLabel`` and ``fetchLabel`` are the labels
    of the launch and wait steps.
    """
    netArgs = netCfg['args']

    startTime = timeit.default_timer()
    (netCfg['fpgaInputs'], netCfg['batch_sz'],
     netCfg['shapes']) = xdnn_io.prepareInput(netArgs, netArgs['PE'])
    _report_elapsed_ms("Time to transfer input image to FPGA", startTime)

    startTime = timeit.default_timer()
    xdnn.exec_async(netArgs['netcfg'], netCfg['weightsBlobs'],
                    netCfg['fpgaInputs'], netCfg['fpgaOutputs'],
                    netCfg['batch_sz'], netArgs['quantizecfg'],
                    netArgs['scaleB'], netArgs['PE'], netCfg['streamId'])
    _report_elapsed_ms(execLabel, startTime)

    startTime = timeit.default_timer()
    xdnn.get_result(netArgs['PE'], netCfg['streamId'])
    _report_elapsed_ms(fetchLabel, startTime)


def _decode_yolo_detections(netCfg):
    """Convert the raw YOLO output of ``netCfg`` into printed, filtered boxes.

    Returns ``(bboxes, names)``: the boxes kept by
    ``nms.do_baseline_nms`` and the class names read from the labels file.
    """
    netArgs = netCfg['args']

    # The output grid is the input size divided by 32.  Floor division
    # keeps it an integer, which reshape() requires.
    out_h = out_w = netArgs['in_shape'][1] // 32
    anchor_boxes = 5
    objectness = 1
    coordinates = 4
    classes = 80
    out_c = objectness + coordinates + classes

    # Reshape the fpgaOutputs into a 4D volume
    yolo_outputs = netCfg['fpgaOutputs'].reshape(anchor_boxes, out_c, out_h,
                                                 out_w)

    # Apply sigmoid to channels 0-1 and channel 4 of every anchor box
    yolo_outputs[:, 0:2, :, :] = sigmoid(
        yolo_outputs[:, 0:2, :, :])  # (X,Y) Predictions
    yolo_outputs[:, 4, :, :] = sigmoid(
        yolo_outputs[:, 4, :, :])  # Objectness / Box Confidence

    # Apply softmax on the class scores of each anchor box
    for box in range(anchor_boxes):
        yolo_outputs[box, 5:, :, :] = softmax(yolo_outputs[box, 5:, :, :])

    # Non-Max Suppression filters out detections with a score lower than
    # 0.24.  Additionally, if two predictions overlap by more than 30%, the
    # one with the lower score is filtered out.
    scorethresh = 0.24
    iouthresh = 0.3
    bboxes = nms.do_baseline_nms(yolo_outputs.flat, netCfg['shapes'][0][1],
                                 netCfg['shapes'][0][0],
                                 netArgs['in_shape'][2],
                                 netArgs['in_shape'][1], out_w, out_h,
                                 anchor_boxes, classes, scorethresh,
                                 iouthresh)

    with open(netArgs['labels']) as f:
        names = [x.strip() for x in f.readlines()]

    # Print the detections the model made
    for j, bbox in enumerate(bboxes):
        print("Obj %d: %s" % (j, names[bbox['classid']]))
        print("\t score = %f" % (bbox['prob']))
        print("\t (xlo,ylo) = (%d,%d)" % (bbox['ll']['x'], bbox['ll']['y']))
        print("\t (xhi,yhi) = (%d,%d)" % (bbox['ur']['x'], bbox['ur']['y']))

    return bboxes, names


def main(argv=None):
    """Detect objects with YOLO, then classify the first one with GoogLeNet.

    The networks named ``'yolo'`` and ``'googlenet'`` are taken from
    ``args['jsoncfg']``.  YOLO runs on the first configured image; the
    first detected box (plus a 10 pixel margin) is cropped, written to
    ``cropped_yolo_output.jpg`` and fed to GoogLeNet, whose FC layer and
    softmax run on the CPU.  A timing line is printed after every step.

    Args:
        argv: command line passed to ``xdnn_io.processCommandLine``.

    Exits with status 1 when the FPGA handle cannot be created or when
    YOLO detects no object.
    """
    args = xdnn_io.processCommandLine(argv)

    startTime = timeit.default_timer()
    ret = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0", args['xlnxlib'])
    if ret != 0:
        sys.exit(1)
    _report_elapsed_ms("Time to createHandle", startTime)

    # From here on the handle is released even if a later step raises.
    try:
        # Per-network state, keyed by the network's configured name.
        netCfgs = defaultdict(dict)
        startTime = timeit.default_timer()
        for streamId, netCfg_args in enumerate(args['jsoncfg']):
            netCfg = netCfgs[str(netCfg_args['name'])]

            netCfg_args['netcfg'] = './data/{}_{}.cmd'.format(
                netCfg_args['net'], netCfg_args['dsp'])
            netCfg['streamId'] = streamId
            netCfg['args'] = netCfg_args
            (netCfg['weightsBlobs'], netCfg['fcWeights'],
             netCfg['fcBiases']) = xdnn_io.loadWeights(netCfg_args)
            netCfg['batch_sz'] = 1
            netCfg['fpgaOutputs'] = xdnn_io.prepareOutput(
                netCfg_args["fpgaoutsz"], netCfg['batch_sz'])
        _report_elapsed_ms("Time to init", startTime)

        ## run YOLO
        netCfg = netCfgs['yolo']
        _run_net_on_fpga(netCfg, "Time to execute Yolo on FPGA",
                         "Time to retrieve yolo outputs from FPGA")

        startTime = timeit.default_timer()
        bboxes, names = _decode_yolo_detections(netCfg)
        _report_elapsed_ms("Time to execute on CPU", startTime)

        if len(bboxes) == 0:
            # Nothing to crop; stop here instead of failing on bboxes[0].
            print("No objects detected; nothing to pass to googlenet")
            sys.exit(1)

        startTime = timeit.default_timer()
        img = cv2.imread(netCfg['args']['images'][0])

        # choose one of the bounding boxes
        obj_idx = 0
        # specify a margin added to the selected bounding box
        margin = 10
        bbox = bboxes[obj_idx]

        # Rows run from the 'ur' y to the 'll' y, columns from the 'll' x
        # to the 'ur' x, each widened by the margin and clamped to the
        # image.
        H_slice = slice(max(0, bbox['ur']['y'] - margin),
                        min(img.shape[0], bbox['ll']['y'] + margin))
        W_slice = slice(max(0, bbox['ll']['x'] - margin),
                        min(img.shape[1], bbox['ur']['x'] + margin))
        img = img[H_slice, W_slice, :]

        print('pass obj {}: {} with size {} to googlenet'.format(
            obj_idx, names[bbox['classid']], img.shape))

        cv2.imwrite('cropped_yolo_output.jpg', img)

        ## run GOOGLENET
        # NOTE: a disabled aspect-preserving rescale and centre crop used
        # to sit here as string-literal code; the crop is handed to
        # prepareInput as-is.
        netCfg = netCfgs['googlenet']
        netCfg['args']['images'] = [img]
        _report_elapsed_ms("Time to prepare googlenet image on CPU",
                           startTime)

        _run_net_on_fpga(netCfg, "Time to execute googlenet on FPGA",
                         "Time to retrieve googlenet outputs from FPGA")

        startTime = timeit.default_timer()
        fcOut = np.empty((netCfg['batch_sz'] * netCfg['args']['outsz']),
                         dtype=np.float32,
                         order='C')
        xdnn.computeFC(netCfg['fcWeights'], netCfg['fcBiases'],
                       netCfg['fpgaOutputs'], netCfg['batch_sz'],
                       netCfg['args']['outsz'], netCfg['args']['fpgaoutsz'],
                       fcOut)
        _report_elapsed_ms("Time to run FC layers on CPU", startTime)

        startTime = timeit.default_timer()
        softmaxOut = xdnn.computeSoftmax(fcOut, netCfg['batch_sz'])
        _report_elapsed_ms("Time to run Softmax on CPU", startTime)

        xdnn_io.printClassification(softmaxOut, netCfg['args'])

        print("\nSuccess!\n")
    finally:
        xdnn.closeHandle()
예제 #12
0
def softmax(fcOutput, g_batchSize):
    """Return ``xdnn.computeSoftmax`` applied to the given FC output.

    Both arguments are forwarded unchanged and in the same order.
    """
    probabilities = xdnn.computeSoftmax(fcOutput, g_batchSize)
    return probabilities