Пример #1
0
def main(argv):
    """Run one batch through every network listed in the JSON config.

    Parses ``argv`` with ``xdnn_io.processCommandLine``, opens the FPGA
    handle, builds one ``XDNNFPGAOp`` per ``jsoncfg`` entry, submits one
    batch per entry asynchronously (stream id = index of the entry), waits
    for every stream, then runs the FC layer and softmax on the host and
    prints the classification for each entry.

    Args:
        argv: argument list handed to ``xdnn_io.processCommandLine``.

    Exits with status 1 when ``xdnn.createHandle`` returns a non-zero code.
    The handle is closed even if a later step raises.
    """
    args = xdnn_io.processCommandLine(argv)
    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0")
    if ret != 0:
        sys.exit(1)

    try:
        labels = xdnn_io.get_labels(args['labels'])

        # Only the per-network entries are used from here on.
        net_configs = args['jsoncfg']

        fpgaRT = {}
        fpgaOutputs = {}
        fcWeights = {}
        fcBiases = {}
        confNames = []

        for netconf_args in net_configs:
            confName = str(netconf_args['name'])
            confNames.append(confName)
            fpgaRT[confName] = xdnn.XDNNFPGAOp(handles, netconf_args)

            # Input shape = batch size followed by the first input
            # descriptor without its leading dimension.
            first_descriptor = next(
                iter(fpgaRT[confName].getInputDescriptors().values()))
            netconf_args['in_shape'] = (
                (netconf_args['batch_sz'],) + tuple(first_descriptor[1:]))

            (fcWeights[confName],
             fcBiases[confName]) = xdnn_io.loadFCWeightsBias(netconf_args)
            fpgaOutputs[confName] = np.empty(
                (netconf_args['batch_sz'], int(netconf_args['fpgaoutsz'])),
                dtype=np.float32, order='C')

        # Every input buffer stays referenced in this list until the end of
        # the function (presumably so it outlives the async submission --
        # TODO confirm against the runtime's buffer ownership rules).
        batchArrays = []
        for streamId, netconf_args in enumerate(net_configs):
            batch = np.empty(netconf_args['in_shape'],
                             dtype=np.float32, order='C')
            batchArrays.append(batch)

            img_paths = xdnn_io.getFilePaths(netconf_args['images'])
            # NOTE(review): if fewer than batch_sz images are found, the
            # remaining rows of `batch` keep np.empty's arbitrary contents.
            for j, p in enumerate(img_paths[:netconf_args['batch_sz']]):
                batch[j, ...], _ = xdnn_io.loadImageBlobFromFile(
                    p,
                    netconf_args['img_raw_scale'],
                    netconf_args['img_mean'],
                    netconf_args['img_input_scale'],
                    netconf_args['in_shape'][2],
                    netconf_args['in_shape'][3])

            confName = str(netconf_args['name'])
            firstInputName = next(iter(fpgaRT[confName].getInputs().keys()))
            firstOutputName = next(iter(fpgaRT[confName].getOutputs().keys()))
            fpgaRT[confName].exec_async(
                {firstInputName: batch},
                {firstOutputName: fpgaOutputs[confName]},
                streamId)

        # Wait for each stream in submission order.
        for streamId, confName in enumerate(confNames):
            fpgaRT[confName].get_result(streamId)

        for netconf_args in net_configs:
            confName = str(netconf_args['name'])
            fcOut = np.empty(
                (netconf_args['batch_sz'], netconf_args['outsz']),
                dtype=np.float32, order='C')
            xdnn.computeFC(fcWeights[confName], fcBiases[confName],
                           fpgaOutputs[confName], fcOut)

            softmaxOut = xdnn.computeSoftmax(fcOut)
            # NOTE(review): the raw 'images' argument is passed here, not
            # the list of paths that were actually loaded -- confirm this
            # is what printClassification expects.
            xdnn_io.printClassification(softmaxOut, netconf_args['images'],
                                        labels)
    finally:
        # Release the FPGA even when a step above raised.
        xdnn.closeHandle()
Пример #2
0
def init_fpga():
    """Initialise the FPGA runtime and populate the module-level context.

    The arguments come from ``build_xdnn_args()`` rather than the real
    command line. The parsed arguments are stored in the global ``g_args``;
    the runtime object, FC weights/bias, I/O buffers and labels are stored
    under string keys in the global ``g_ctxt``.

    Exits with status 1 when ``xdnn.createHandle`` returns a non-zero code.
    """
    global g_args, g_ctxt

    print(" --- INIT FPGA --- \n")
    cmdline = build_xdnn_args()
    print(cmdline)
    g_args = xdnn_io.processCommandLine(cmdline)
    print(" --- After parsing --- \n")
    print(g_args)

    print(" --- Create handle --- \n")
    status, fpga_handles = xdnn.createHandle(g_args['xclbin'],
                                             "kernelSxdnn_0")
    if status != 0:
        print(" --- !!! FAILED: Cannot create handle. --- \n")
        sys.exit(1)

    print(" --- Create fpgaRT --- \n")
    runtime = xdnn.XDNNFPGAOp(fpga_handles, g_args)
    g_ctxt["fpgaRT"] = runtime

    print(" --- Weight and Bias --- \n")
    g_ctxt["fcWeight"], g_ctxt["fcBias"] = xdnn_io.loadFCWeightsBias(g_args)

    print(" --- Init input input/output area --- \n")
    if not is_deploymode():
        # Host-side buffers allocated here from the parsed sizes.
        out_shape = (g_args['batch_sz'], g_args['fpgaoutsz'])
        g_ctxt['fpgaOutput'] = np.empty(out_shape, dtype=np.float32,
                                        order='C')
        in_shape = (g_args['batch_sz'],) + g_args['in_shape']
        g_ctxt['batch_array'] = np.empty(in_shape, dtype=np.float32,
                                         order='C')
    else:
        # Deploy mode: take the I/O objects from the runtime itself.
        g_ctxt['fpgaOutput'] = runtime.getOutputs()
        g_ctxt['fpgaInput'] = runtime.getInputs()
        batch_dim = (g_args['batch_sz'],)
        # First input descriptor, leading dimension replaced by batch size.
        g_ctxt['inShape'] = batch_dim + tuple(
            runtime.getInputDescriptors().itervalues().next()[1:])

    fc_shape = (g_args['batch_sz'], g_args['outsz'])
    g_ctxt['fcOutput'] = np.empty(fc_shape, dtype=np.float32, order='C')

    print(" --- Get lables --- \n")
    g_ctxt['labels'] = xdnn_io.get_labels(g_args['labels'])

    print(" --- FPGA INITIALIZED! ---\n")
Пример #3
0
def main():
    """Classify every image from the command line in batches on the FPGA.

    For each slice of ``batch_sz`` image paths the images are loaded into a
    reused batch buffer, executed on the FPGA, passed through the FC layer
    and softmax on the host, and printed. When ``args['golden']`` is set,
    top-1 and top-5 hits are counted with ``xdnn_io.isTopK`` and an average
    accuracy line is printed at the end.

    Exits with status 1 when ``xdnn.createHandle`` returns a non-zero code.
    The handle is closed even if a later step raises.
    """
    args = xdnn_io.processCommandLine()

    ret, handles = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0")
    if ret != 0:
        sys.exit(1)

    top1Count = 0
    top5Count = 0
    img_paths = []
    try:
        fpgaRT = xdnn.XDNNFPGAOp(handles, args)
        fcWeight, fcBias = xdnn_io.loadFCWeightsBias(args)
        img_paths = xdnn_io.getFilePaths(args['images'])

        batch_sz = args['batch_sz']
        fpgaOutput = np.empty((batch_sz, args['fpgaoutsz']),
                              dtype=np.float32, order='C')
        fcOutput = np.empty((batch_sz, args['outsz']),
                            dtype=np.float32, order='C')
        batch_array = np.empty((batch_sz,) + args['in_shape'],
                               dtype=np.float32, order='C')
        labels = xdnn_io.get_labels(args['labels'])
        if args['golden']:
            goldenMap = xdnn_io.getGoldenMap(args['golden'])

        for i in range(0, len(img_paths), batch_sz):
            # Paths of the images that make up this batch.
            pl = list(img_paths[i:i + batch_sz])
            # NOTE(review): when the last slice is shorter than batch_sz,
            # the trailing rows of batch_array still hold the previous
            # batch's images.
            for j, p in enumerate(pl):
                # NOTE(review): the size arguments are in_shape[2] then
                # in_shape[1] -- confirm that is the order
                # loadImageBlobFromFile expects.
                batch_array[j, ...], _ = xdnn_io.loadImageBlobFromFile(
                    p, args['img_raw_scale'], args['img_mean'],
                    args['img_input_scale'], args['in_shape'][2],
                    args['in_shape'][1])

            fpgaRT.execute(batch_array, fpgaOutput)
            xdnn.computeFC(fcWeight, fcBias, fpgaOutput, batch_sz,
                           args['outsz'], args['fpgaoutsz'], fcOutput)
            softmaxOut = xdnn.computeSoftmax(fcOutput)
            xdnn_io.printClassification(softmaxOut, pl, labels)

            if args['golden']:
                for j, p in enumerate(pl):
                    top1Count += xdnn_io.isTopK(softmaxOut[j], goldenMap, p,
                                                labels, 1)
                    top5Count += xdnn_io.isTopK(softmaxOut[j], goldenMap, p,
                                                labels, 5)
    finally:
        # Release the FPGA even when a step above raised.
        xdnn.closeHandle()

    # Skip the summary when no image was processed (avoids dividing by 0).
    if args['golden'] and len(img_paths) > 0:
        total = float(len(img_paths))
        # The % formatting must sit inside the parentheses so the line also
        # works when print is a function.
        print("\nAverage accuracy (n=%d) Top-1: %.1f%%, Top-5: %.1f%%\n" % (
            len(img_paths),
            float(top1Count) / total * 100.,
            float(top5Count) / total * 100.))
Пример #4
0
    def __init__(self, maxNumStreams):
        """Open the FPGA, load weights/labels and set up the stream slots.

        Args:
            maxNumStreams: number of stream slots to create. Every slot
                index starts out in ``_streamsAvailable`` with ``None`` as
                its input and output entry.

        Exits with status 1 when ``xdnn.createHandle`` returns a non-zero
        code.
        """
        self._maxNumStreams = maxNumStreams
        self._streamsAvailable = []
        self._streamInputs = []
        self._streamOutputs = []

        self._config = xdnn_io.processCommandLine()
        status, fpga_handles = xdnn.createHandle(self._config['xclbin'])
        if status != 0:
            sys.exit(1)

        self._fpgaRT = xdnn.XDNNFPGAOp(fpga_handles, self._config)
        weights_and_bias = xdnn_io.loadFCWeightsBias(self._config)
        self._fcWeight, self._fcBias = weights_and_bias
        self._labels = xdnn_io.get_labels(self._config['labels'])

        # All stream ids are free at start; no buffers are attached yet.
        self._streamsAvailable.extend(range(maxNumStreams))
        self._streamInputs.extend([None] * maxNumStreams)
        self._streamOutputs.extend([None] * maxNumStreams)
Пример #5
0
def main():
    """Run one network once on the FPGA and print the classification.

    The FPGA execution, the host FC layer and the host softmax are each
    timed with ``timeit.default_timer`` and the elapsed milliseconds are
    printed after each stage.

    Exits with status 1 when ``xdnn.createHandle`` returns a non-zero code.
    """
    args = xdnn_io.processCommandLine()
    status = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0",
                               args['xlnxlib'])
    if status != 0:
        sys.exit(1)

    weightsBlob, fcWeight, fcBias = xdnn_io.loadWeights(args)
    fpgaInputs, batch_sz = xdnn_io.prepareInput(args)
    fpgaOutput = xdnn_io.prepareOutput(args['fpgaoutsz'], batch_sz)

    # Stage 1: forward pass on the FPGA.
    tick = timeit.default_timer()
    xdnn.execute(args['netcfg'], weightsBlob, fpgaInputs, fpgaOutput,
                 batch_sz,  # num batches
                 args['quantizecfg'], args['scaleB'], args['PE'])
    fpga_seconds = timeit.default_timer() - tick
    print("\nAfter FPGA (%f ms)" % (fpga_seconds * 1000))

    # Stage 2: fully connected layer on the host.
    tick = timeit.default_timer()
    fcOut = xdnn.computeFC(fcWeight, fcBias, fpgaOutput, batch_sz,
                           args['outsz'], args['fpgaoutsz'], args['useblas'])
    fc_seconds = timeit.default_timer() - tick
    print("\nAfter FC (%f ms)" % (fc_seconds * 1000))

    # Stage 3: softmax on the host.
    tick = timeit.default_timer()
    softmaxOut = xdnn.computeSoftmax(fcOut, batch_sz)
    softmax_seconds = timeit.default_timer() - tick
    print("\nAfter Softmax (%f ms)" % (softmax_seconds * 1000))

    xdnn_io.printClassification(softmaxOut, args)

    print("\nSuccess!\n")
    xdnn.closeHandle()
Пример #6
0
def main():
    """Run every network in the JSON config concurrently on the FPGA.

    For each ``jsoncfg`` entry the weights, inputs and output buffer are
    prepared, then all networks are submitted with ``xdnn.exec_async`` and
    collected with ``xdnn.get_result``. The FC layer and softmax are run on
    the host and the classification is printed per network. Elapsed time in
    milliseconds is printed after every stage.

    Exits with status 1 when ``xdnn.createHandle`` returns a non-zero code.
    The handle is closed even if a later step raises.
    """
    args = xdnn_io.processCommandLine()

    startTime = timeit.default_timer()
    ret = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0", args['xlnxlib'])
    if ret != 0:
        sys.exit(1)
    elapsedTime = timeit.default_timer() - startTime
    print("\nAfter createHandle (%f ms):" % (elapsedTime * 1000))

    try:
        startTime = timeit.default_timer()

        # Per-network state, keyed by the configuration name.
        fpgaInputs = {}
        fpgaOutputs = {}
        weightsBlobs = {}
        fcWeights = {}
        fcBiases = {}
        batch_sizes = {}
        PEs = {}
        netFiles = {}
        confNames = []

        for netconf_args in args['jsoncfg']:
            confName = str(netconf_args['name'])
            confNames.append(confName)

            # 'PE' is a whitespace-separated string of integers.
            PE = [int(x) for x in netconf_args['PE'].split()]
            PEs[confName] = PE

            (weightsBlobs[confName], fcWeights[confName],
             fcBiases[confName]) = xdnn_io.loadWeights(netconf_args)
            (fpgaInputs[confName],
             batch_sizes[confName]) = xdnn_io.prepareInput(netconf_args, PE)
            fpgaOutputs[confName] = xdnn_io.prepareOutput(
                int(netconf_args['fpgaoutsz']), batch_sizes[confName])
            netFiles[confName] = str(netconf_args['netcfg'])

        elapsedTime = timeit.default_timer() - startTime
        print("\nAfter init (%f ms):" % (elapsedTime * 1000))
        startTime = timeit.default_timer()

        # Submit every network before waiting for any of them.
        for netconf_args in args['jsoncfg']:
            confName = str(netconf_args['name'])
            xdnn.exec_async(netFiles[confName], weightsBlobs[confName],
                            fpgaInputs[confName], fpgaOutputs[confName],
                            int(batch_sizes[confName]),
                            netconf_args['quantizecfg'],
                            netconf_args['scaleB'], PEs[confName])

        elapsedTime = timeit.default_timer() - startTime
        print("\nAfter Execonly (%f ms):" % (elapsedTime * 1000))
        startTime = timeit.default_timer()

        for confName in confNames:
            xdnn.get_result(PEs[confName])

        elapsedTime = timeit.default_timer() - startTime
        print("\nAfter wait (%f ms):" % (elapsedTime * 1000))

        for netconf_args in args['jsoncfg']:
            confName = str(netconf_args['name'])

            # Restart the clock for every network; otherwise the FC time of
            # each network after the first would also include the previous
            # network's softmax and printing.
            startTime = timeit.default_timer()
            fcOut = xdnn.computeFC(fcWeights[confName], fcBiases[confName],
                                   fpgaOutputs[confName],
                                   batch_sizes[confName],
                                   netconf_args['outsz'],
                                   netconf_args['fpgaoutsz'],
                                   netconf_args['useblas'])
            elapsedTime = timeit.default_timer() - startTime
            print("\nAfter FC (%f ms):" % (elapsedTime * 1000))

            startTime = timeit.default_timer()
            softmaxOut = xdnn.computeSoftmax(fcOut, batch_sizes[confName])
            elapsedTime = timeit.default_timer() - startTime
            print("\nAfter Softmax (%f ms):" % (elapsedTime * 1000))

            xdnn_io.printClassification(softmaxOut, netconf_args)

        print("\nSuccess!\n")
    finally:
        # Release the FPGA even when a step above raised.
        xdnn.closeHandle()
def main(argv=None):
    """Detect objects with YOLO on the FPGA, then classify one with GoogLeNet.

    The ``jsoncfg`` entries named 'yolo' and 'googlenet' are used. YOLO is
    run on the configured image; its raw output is post-processed on the
    host (sigmoid, softmax, non-max suppression) and the detections are
    printed. The first detection is cropped from the original image (with a
    10-pixel margin), written to 'cropped_yolo_output.jpg' and handed to
    GoogLeNet, whose classification is printed. Elapsed time in
    milliseconds is printed after every stage.

    Args:
        argv: argument list handed to ``xdnn_io.processCommandLine``.

    Exits with status 1 when ``xdnn.createHandle`` returns a non-zero code.
    Returns early (after a message) when YOLO reports no detection. The
    handle is closed on every path, including when a step raises.
    """
    args = xdnn_io.processCommandLine(argv)

    startTime = timeit.default_timer()
    ret = xdnn.createHandle(args['xclbin'], "kernelSxdnn_0", args['xlnxlib'])
    if ret != 0:
        sys.exit(1)
    elapsedTime = timeit.default_timer() - startTime
    print("\nTime to createHandle (%f ms):" % (elapsedTime * 1000))

    try:
        # Only the per-network 'jsoncfg' entries are needed from here on.
        netCfgs = defaultdict(dict)
        startTime = timeit.default_timer()
        for streamId, netCfg_args in enumerate(args['jsoncfg']):
            confName = str(netCfg_args['name'])

            netCfg_args['netcfg'] = './data/{}_{}.cmd'.format(
                netCfg_args['net'], netCfg_args['dsp'])
            entry = netCfgs[confName]
            entry['streamId'] = streamId
            entry['args'] = netCfg_args
            (entry['weightsBlobs'], entry['fcWeights'],
             entry['fcBiases']) = xdnn_io.loadWeights(netCfg_args)
            # Output buffers are sized for a batch of one; 'batch_sz' is
            # overwritten later by prepareInput.
            entry['batch_sz'] = 1
            entry['fpgaOutputs'] = xdnn_io.prepareOutput(
                netCfg_args["fpgaoutsz"], entry['batch_sz'])
        elapsedTime = timeit.default_timer() - startTime
        print("\nTime to init (%f ms):" % (elapsedTime * 1000))

        ## run YOLO
        netCfg = netCfgs['yolo']

        startTime = timeit.default_timer()
        (netCfg['fpgaInputs'], netCfg['batch_sz'],
         netCfg['shapes']) = xdnn_io.prepareInput(netCfg['args'],
                                                  netCfg['args']['PE'])
        elapsedTime = timeit.default_timer() - startTime
        print("\nTime to transfer input image to FPGA (%f ms):" %
              (elapsedTime * 1000))

        startTime = timeit.default_timer()
        xdnn.exec_async(netCfg['args']['netcfg'], netCfg['weightsBlobs'],
                        netCfg['fpgaInputs'], netCfg['fpgaOutputs'],
                        netCfg['batch_sz'], netCfg['args']['quantizecfg'],
                        netCfg['args']['scaleB'], netCfg['args']['PE'],
                        netCfg['streamId'])
        elapsedTime = timeit.default_timer() - startTime
        print("\nTime to execute Yolo on FPGA (%f ms):" %
              (elapsedTime * 1000))

        startTime = timeit.default_timer()
        xdnn.get_result(netCfg['args']['PE'], netCfg['streamId'])
        elapsedTime = timeit.default_timer() - startTime
        print("\nTime to retrieve yolo outputs from FPGA (%f ms):" %
              (elapsedTime * 1000))

        startTime = timeit.default_timer()
        # Output grid size: in_shape[1] divided by 32, used for both axes.
        # Floor division keeps these integers so they are valid reshape
        # dimensions.
        out_h = out_w = netCfg['args']['in_shape'][1] // 32
        anchor_boxes = 5
        objectness = 1
        coordinates = 4
        classes = 80
        out_c = objectness + coordinates + classes

        # Reshape the fpgaOutputs into a 4D volume
        yolo_outputs = netCfg['fpgaOutputs'].reshape(anchor_boxes, out_c,
                                                     out_h, out_w)

        # Apply sigmoid to channels 0, 1 and 4 of every anchor box
        yolo_outputs[:, 0:2, :, :] = sigmoid(
            yolo_outputs[:, 0:2, :, :])  # (X,Y) Predictions
        yolo_outputs[:, 4, :, :] = sigmoid(
            yolo_outputs[:, 4, :, :])  # Objectness / Box Confidence

        # Apply softmax on the class scores for each anchor box
        for box in range(anchor_boxes):
            yolo_outputs[box, 5:, :, :] = softmax(yolo_outputs[box, 5:, :, :])

        # Non-max suppression: drop detections scoring below 0.24; of two
        # predictions overlapping by more than 30%, drop the lower-scored.
        scorethresh = 0.24
        iouthresh = 0.3
        bboxes = nms.do_baseline_nms(
            yolo_outputs.flat, netCfg['shapes'][0][1],
            netCfg['shapes'][0][0], netCfg['args']['in_shape'][2],
            netCfg['args']['in_shape'][1], out_w, out_h, anchor_boxes,
            classes, scorethresh, iouthresh)

        with open(netCfg['args']['labels']) as f:
            names = [line.strip() for line in f]

        # Print the detections the model made
        for j, bbox in enumerate(bboxes):
            print("Obj %d: %s" % (j, names[bbox['classid']]))
            print("\t score = %f" % (bbox['prob']))
            print("\t (xlo,ylo) = (%d,%d)" %
                  (bbox['ll']['x'], bbox['ll']['y']))
            print("\t (xhi,yhi) = (%d,%d)" %
                  (bbox['ur']['x'], bbox['ur']['y']))

        elapsedTime = timeit.default_timer() - startTime
        print("\nTime to execute on CPU (%f ms):" % (elapsedTime * 1000))

        # Without a detection there is nothing to crop and classify.
        if len(bboxes) == 0:
            print("\nNo objects detected by YOLO; skipping googlenet\n")
            return

        startTime = timeit.default_timer()

        img = cv2.imread(netCfg['args']['images'][0])

        # choose one of the bounding boxes
        obj_idx = 0
        chosen = bboxes[obj_idx]

        # margin added around the selected bounding box, clamped to the
        # image borders
        margin = 10

        # Rows run from the box's 'ur' y to its 'll' y (presumably image
        # coordinates with y growing downwards -- TODO confirm).
        H_slice = slice(max(0, chosen['ur']['y'] - margin),
                        min(img.shape[0], chosen['ll']['y'] + margin))
        W_slice = slice(max(0, chosen['ll']['x'] - margin),
                        min(img.shape[1], chosen['ur']['x'] + margin))
        img = img[H_slice, W_slice, :]

        print('pass obj {}: {} with size {} to googlenet'.format(
            obj_idx, names[chosen['classid']], img.shape))

        cv2.imwrite('cropped_yolo_output.jpg', img)

        ## run GOOGLENET
        netCfg = netCfgs['googlenet']

        # The cropped image array is passed as-is; no resize or crop to the
        # network input size is done here.
        netCfg['args']['images'] = [img]
        elapsedTime = timeit.default_timer() - startTime
        print("\nTime to prepare googlenet image on CPU (%f ms):" %
              (elapsedTime * 1000))

        startTime = timeit.default_timer()
        (netCfg['fpgaInputs'], netCfg['batch_sz'],
         netCfg['shapes']) = xdnn_io.prepareInput(netCfg['args'],
                                                  netCfg['args']['PE'])
        elapsedTime = timeit.default_timer() - startTime
        print("\nTime to transfer input image to FPGA (%f ms):" %
              (elapsedTime * 1000))

        startTime = timeit.default_timer()
        xdnn.exec_async(netCfg['args']['netcfg'], netCfg['weightsBlobs'],
                        netCfg['fpgaInputs'], netCfg['fpgaOutputs'],
                        netCfg['batch_sz'], netCfg['args']['quantizecfg'],
                        netCfg['args']['scaleB'], netCfg['args']['PE'],
                        netCfg['streamId'])
        elapsedTime = timeit.default_timer() - startTime
        print("\nTime to execute googlenet on FPGA (%f ms):" %
              (elapsedTime * 1000))

        startTime = timeit.default_timer()
        xdnn.get_result(netCfg['args']['PE'], netCfg['streamId'])
        elapsedTime = timeit.default_timer() - startTime
        print("\nTime to retrieve googlenet outputs from FPGA (%f ms):" %
              (elapsedTime * 1000))

        startTime = timeit.default_timer()
        # Flat output buffer of batch_sz * outsz floats, filled by computeFC.
        fcOut = np.empty((netCfg['batch_sz'] * netCfg['args']['outsz']),
                         dtype=np.float32,
                         order='C')
        xdnn.computeFC(netCfg['fcWeights'], netCfg['fcBiases'],
                       netCfg['fpgaOutputs'], netCfg['batch_sz'],
                       netCfg['args']['outsz'], netCfg['args']['fpgaoutsz'],
                       fcOut)
        elapsedTime = timeit.default_timer() - startTime
        print("\nTime to run FC layers on CPU (%f ms):" %
              (elapsedTime * 1000))

        startTime = timeit.default_timer()
        softmaxOut = xdnn.computeSoftmax(fcOut, netCfg['batch_sz'])
        elapsedTime = timeit.default_timer() - startTime
        print("\nTime to run Softmax on CPU (%f ms):" % (elapsedTime * 1000))

        xdnn_io.printClassification(softmaxOut, netCfg['args'])

        print("\nSuccess!\n")
    finally:
        # Release the FPGA on every path, including the early return.
        xdnn.closeHandle()
Пример #8
0
def benchmark(mode="Non-Blocking"):
    """Time repeated forward passes on random input and print a summary.

    Uses the first ``jsoncfg`` entry of the parsed command line. Two random
    input batches and two output buffers are prepared, one per stream
    (ids 0 and 1), and the network schedule is loaded for both streams.

    Args:
        mode: "Non-Blocking" (default) keeps both streams busy with
            ``exec_async`` / ``get_result``; any other value runs
            ``args["iterations"]`` blocking ``xdnn.execute`` calls on the
            first input.

    In non-blocking mode batches are always issued in pairs, so the number
    actually run is ``2 * max(iterations // 2, 1)``; the summary reports
    that number rather than the requested ``iterations``.

    Exits with status 1 when ``xdnn.createHandle`` returns a truthy value.
    The handle is closed even if a later step raises.
    """
    # Extract Arguments from json
    args = xdnn_io.processCommandLine()["jsoncfg"][0]

    if "platform" in args:
        args["xclbin"] = "../../overlaybins/" + str(
            args["platform"]) + "/" + args["xclbin"]

    # Establish Communication w/ FPGA
    if xdnn.createHandle(args['xclbin'], libFile=args['xlnxlib']):
        sys.exit(1)

    try:
        # Transfer weights to device memory
        if "usexdnnv3" in args and args["usexdnnv3"] == "1":
            weightsBlob = xdnn_io.loadWeightsBiasQuantv3(args)
        else:
            weightsBlob = xdnn_io.loadWeightsBiasQuant(args)

        batchsz = args["batchsz"]
        stream_ids = (0, 1)
        # Number of values in one flattened input sample.
        input_len = int(np.prod(args["in_shape"]))

        # Create random input data, one batch per stream
        fpgaInputs = []
        for stream_id in stream_ids:
            data = np.float32(
                np.random.standard_normal((batchsz, input_len)))
            data = xdnn.quantizeInputs(args["firstfpgalayer"],
                                       args["quantizecfg"], args["scaleB"],
                                       data)
            fpgaInputs.append(
                xdnn.prepareInputsForFpga(data, args["quantizecfg"],
                                          args["scaleB"], -1,
                                          args["firstfpgalayer"], stream_id))

        # Create buffers in host memory for result
        fpgaOutputs = [
            xdnn_io.prepareOutput(args['fpgaoutsz'], batchsz)
            for _ in stream_ids
        ]

        # Load network schedule to accelerator
        for stream_id in stream_ids:
            xdnn.initScript(args['netcfg'], weightsBlob, batchsz,
                            args['quantizecfg'], args['scaleB'], args['PE'],
                            stream_id)

        def submit(stream_id):
            # Queue one batch on the given stream without waiting for it.
            xdnn.exec_async(args['netcfg'], weightsBlob,
                            fpgaInputs[stream_id], fpgaOutputs[stream_id],
                            batchsz, args['quantizecfg'], args['scaleB'],
                            args['PE'], stream_id)

        # Run forward propagation N times
        print("Running inference...\n")
        startTime = timeit.default_timer()

        if mode == "Non-Blocking":
            # One round = one batch on each stream; at least one round is
            # always issued.
            num_rounds = max(args["iterations"] // 2, 1)

            for stream_id in stream_ids:
                submit(stream_id)

            for _ in range(num_rounds - 1):
                for stream_id in stream_ids:
                    xdnn.get_result(-1, stream_id)  # collect ...
                    submit(stream_id)  # ... and refill the same stream

            for stream_id in stream_ids:
                xdnn.get_result(-1, stream_id)

            batches_run = len(stream_ids) * num_rounds
        else:
            for _ in range(args["iterations"]):
                xdnn.execute(args['netcfg'], weightsBlob, fpgaInputs[0],
                             fpgaOutputs[0], batchsz, args['quantizecfg'],
                             args['scaleB'], args['PE'])
            batches_run = args["iterations"]

        cumulative_time = timeit.default_timer() - startTime

        # Summarize, based on the batches that were actually executed
        images_run = batches_run * batchsz
        print("===========================================")
        print("Performance Summary\n")
        print("  Network: %s" % (args["name"]))
        print("  Precision: %d" % (args["precision"]))
        print("  Images: %d" % (images_run))
        print("  Batch Size: %d" % (batchsz))
        print("  Total Batches: %d" % (batches_run))
        print("  Total Time: %.2f ms" % (1000 * cumulative_time))
        print("  SIL: %.2f ms" %
              (1000 * cumulative_time / batches_run))  # Time per batch
        print("  FPS: %.2f" % (images_run / cumulative_time))
        print("  GOPS: %.2f" %
              (args["ops"] * images_run / cumulative_time / 1000000000))
        print("===========================================\n")
    finally:
        # Release FPGA, even when a step above raised
        xdnn.closeHandle()