예제 #1
0
def main(args=None):
    """Submit ``g_nQueries`` batched queries to a ``Dispatcher`` and print throughput.

    Args:
        args: Accepted but not consulted; the options are always re-read
            through ``xdnn_io.processCommandLine()``.
    """
    os.environ['LS_BIND_NOW'] = "1"
    args = xdnn_io.processCommandLine()
    images = xdnn_io.getFilePaths(args['images'])

    # The dispatcher exposes the input shape; its first entry sizes each query.
    dispatcher = Dispatcher(args['vitis_rundir'], g_nFPGA, g_nDispatchers,
                            args['batch_sz'])
    inshape = dispatcher.inshape
    per_query = inshape[0]

    # Preprocessing parameters attached to every work item.
    prep = (args['img_raw_scale'], args['img_mean'], args['img_input_scale'])

    # One (query index, image paths, prep params) item per query.  Paths are
    # taken modulo the image count, so a short image list is reused cyclically.
    work = [
        (q, [images[(q * per_query + k) % len(images)]
             for k in range(per_query)], prep)
        for q in range(g_nQueries)
    ]

    t_begin = timeit.default_timer()
    dispatcher.run(work)
    # The reference is dropped before the clock is read, so anything triggered
    # by that release is part of the measured time.
    del dispatcher
    elapsed = timeit.default_timer() - t_begin

    qps = g_nQueries / elapsed
    fps = g_nQueries * inshape[0] / elapsed
    print("Queries: %d, Elapsed: %.2fs, QPS: %.2f, FPS: %.2f"
          % (g_nQueries, elapsed, qps, fps))
    sys.stdout.flush()
예제 #2
0
def main():
  """Classify the images named by ``args['images']`` in batches via a ``Runner``.

  Each batch is copied into the first input buffer, executed on the runner,
  then passed through ``computeFC`` and ``computeSoftmax`` of an
  ``XDNNCPUOp``.  When ``args['golden']`` is set, Top-1/Top-5 hits are
  counted and an accuracy summary is printed at the end; otherwise the
  classification of each batch is printed.
  """
  args = xdnn_io.processCommandLine()

  runner = Runner(args['vitis_rundir'])
  inTensors = runner.get_input_tensors()
  outTensors = runner.get_output_tensors()

  batch_sz = args['batch_sz']
  if batch_sz == -1:
    # fall back to the batch dimension reported by the first input tensor
    batch_sz = inTensors[0].dims[0]

  if args['golden']:
    goldenMap = xdnn_io.getGoldenMap(args['golden'])
    top5Count = 0
    top1Count = 0

  def _host_buffers(tensors):
    # One float32 C-ordered array per tensor: the tensor's dims with the
    # leading dimension replaced by batch_sz.
    buffers = []
    for tensor in tensors:
      dims = [tensor.dims[d] for d in range(tensor.ndims)]
      buffers.append(
          np.empty((batch_sz,) + tuple(dims[1:]), dtype=np.float32, order='C'))
    return buffers

  # index 0 holds the input buffers, index 1 the output buffers
  fpgaBlobs = [_host_buffers(inTensors), _host_buffers(outTensors)]

  img_paths = xdnn_io.getFilePaths(args['images'])
  labels = xdnn_io.get_labels(args['labels'])
  xdnnCPUOp = xdnn.XDNNCPUOp("%s/weights.h5" % args['vitis_rundir'])
  fcOutput = np.empty((batch_sz, args['outsz'],), dtype=np.float32, order='C')

  fpgaInput = fpgaBlobs[0][0]
  for start in range(0, len(img_paths), batch_sz):
    chunk = img_paths[start:start + batch_sz]
    pl = []
    # Only the first len(chunk) slots are overwritten; any remaining slots
    # of the input buffer are left as they were.
    for slot, path in enumerate(chunk):
      img, _ = xdnn_io.loadImageBlobFromFile(
          path,
          args['img_raw_scale'], args['img_mean'], args['img_input_scale'],
          fpgaInput.shape[2], fpgaInput.shape[3])
      pl.append(path)
      np.copyto(fpgaInput[slot], img)

    job = runner.execute_async(fpgaBlobs[0], fpgaBlobs[1])
    runner.wait(job)

    xdnnCPUOp.computeFC(fpgaBlobs[1][0], fcOutput)
    softmaxOut = xdnnCPUOp.computeSoftmax(fcOutput)
    if not args['golden']:
      xdnn_io.printClassification(softmaxOut, pl, labels)
      continue
    for slot, path in enumerate(chunk):
      top1Count += xdnn_io.isTopK(softmaxOut[slot], goldenMap, path, labels, 1)
      top5Count += xdnn_io.isTopK(softmaxOut[slot], goldenMap, path, labels, 5)

  if args['golden']:
    total = float(len(img_paths))
    top1_pct = float(top1Count) / total * 100.
    top5_pct = float(top5Count) / total * 100.
    print(("\nAverage accuracy (n=%d) Top-1: %.1f%%, Top-5: %.1f%%\n")
          % (len(img_paths), top1_pct, top5_pct))
예제 #3
0
def pre_process(q, args):
    """Run every image through ``waa_rt.PreProcess`` and put each array on ``q``.

    Args:
        q: Object with a ``put`` method; receives the first element returned
            by ``preprocess_input`` for each image.
        args: Mapping read for ``'xclbin'``, ``'deviceid'``, ``'batch_sz'``
            and ``'images'``.
    """
    xclbin_path = str(args['xclbin'] + "/xdnn_v3_96x16_2pe_8b_9mb_bank03.xclbin")
    kernel_name = "pp_pipeline_accel"
    device_index = args['deviceid']
    fpga_pp = waa_rt.PreProcess(xclbin_path, kernel_name, device_index, 1)

    step = args['batch_sz']
    img_paths = xdnn_io.getFilePaths(args['images'])

    # Walk the list one batch-sized slice at a time; images are still
    # preprocessed and queued individually.
    for start in range(0, len(img_paths), step):
        for path in img_paths[start:start + step]:
            # the second returned value is not used here
            arr, _ = fpga_pp.preprocess_input(path)
            q.put(arr)
예제 #4
0
def pre_process(q, args):
  """Run every image through ``xplusml.PreProcess`` and put each array on ``q``.

  Prints a message once the preprocessing handle exists and another after
  every single image has been queued.

  Args:
    q: Object with a ``put`` method; receives the first element returned by
      ``preprocess_input`` for each image.
    args: Mapping read for ``'xclbin'``, ``'deviceid'``, ``'batch_sz'`` and
      ``'images'``.
  """
  xclbin_path = str(args['xclbin'] + "/xdnn_v3_96x16_2pe_8b_9mb_bank03.xclbin")
  kernel_name = "pp_pipeline_accel"
  device_index = args['deviceid']
  fpga_pp = xplusml.PreProcess(xclbin_path, kernel_name, device_index)

  step = args['batch_sz']
  img_paths = xdnn_io.getFilePaths(args['images'])
  print("Pre-processing handle created. Populating Queue")

  # Walk the list one batch-sized slice at a time; images are still
  # preprocessed and queued individually.
  for start in range(0, len(img_paths), step):
    for path in img_paths[start:start + step]:
      # the second returned value is not used here
      arr, _ = fpga_pp.preprocess_input(path)
      q.put(arr)
      print("Queue populated")
예제 #5
0
def pre_process(q_img, q_shape, args):
  """Preprocess every image and publish the results on two queues.

  For each image path, ``preprocess_input`` returns a pair; the first element
  is put on ``q_img`` and the second on ``q_shape``, in that order.

  Args:
    q_img: Object with a ``put`` method; receives the preprocessed array.
    q_shape: Object with a ``put`` method; receives the accompanying shape
      value.
    args: Mapping read for ``'xclbin'``, ``'deviceid'``, ``'batch_sz'`` and
      ``'images'``.

  Raises:
    SystemExit: with status 1 when the preprocessing handle compares equal
      to -1.
  """
  xclbin_p = str(args['xclbin'] + "/xdnn_v3_96x16_2pe_8b_9mb_bank03.xclbin")
  kernelName_p = "pp_pipeline_accel"
  deviceIdx_p = args['deviceid']
  handle_p = waa_rt.PreProcess(xclbin_p, kernelName_p, deviceIdx_p, 1)
  if handle_p == -1:
    print("Unable to Create handle for pre-processing kernel. Only U200 device is supported")
    # Exit with a non-zero status so the failure is visible to whoever
    # launched this process; a bare sys.exit() would report success.
    sys.exit(1)

  batch_sz = args['batch_sz']
  img_paths = xdnn_io.getFilePaths(args['images'])
  print("Pre-processing handle created. Populating Queue")

  # Walk the list one batch-sized slice at a time; images are still
  # preprocessed and queued individually.
  for start in range(0, len(img_paths), batch_sz):
    for path in img_paths[start:start + batch_sz]:
      arr, shape = handle_p.preprocess_input(path)
      q_img.put(arr)
      q_shape.put(shape)
예제 #6
0
    def __init__(self):
        """Parse options, create the process pool, batch the images, open the FPGA op.

        Sets the module-level globals ``pool`` and ``pFpgaRT`` in addition to
        the instance attributes ``args``, ``all_image_paths``, ``batches``
        and ``mpid``.
        """
        global pool
        global pFpgaRT

        self.args = xdnn_io.processCommandLine()
        # 'numproc' is expected among the parsed options; the original author
        # noted that this switch had not yet been added to xdnn_io.py.
        pool = Pool(self.args['numproc'])

        # Partition the image paths into consecutive slices of batch_sz
        # entries (list of lists).
        self.batches = []
        self.all_image_paths = xdnn_io.getFilePaths(self.args['images'])
        step = self.args['batch_sz']
        for start in range(0, len(self.all_image_paths), step):
            self.batches.append(self.all_image_paths[start:start + step])

        # The parent process owns the FPGA handle.
        pFpgaRT = xdnn.XDNNFPGAOp(self.args)

        # Input shape handed to child processes: the first input descriptor
        # with its leading entry replaced by batch_sz.
        first_descriptor = tuple(pFpgaRT.getInputDescriptors().values())[0]
        self.args['inShape'] = (self.args['batch_sz'], ) + tuple(
            first_descriptor[1:])

        # Identifier kept so child processes can refer to the same handle.
        self.mpid = pFpgaRT.getMPID()
예제 #7
0
def main():
    """Run ``g_nQueries`` batched queries through dispatcher/worker pools and print throughput."""
    args = xdnn_io.processCommandLine()
    images = xdnn_io.getFilePaths(args['images'])

    # start comms
    xserver = xstream.Server()

    # acquire resources
    fmaster = FpgaMaster(args['vitis_rundir'])
    inshape = list(fmaster.inshape)
    requested_batch = args['batch_sz']
    if requested_batch != -1:
        # an explicit batch size overrides the leading input dimension
        inshape[0] = requested_batch

    # spawn dispatchers, then workers
    dispatcher = Dispatcher(g_nDispatchers, g_nWorkers, inshape)
    workers = WorkerPool(args['vitis_rundir'] + "_worker", g_nWorkers, args)

    # Preprocessing parameters attached to every work item.
    prep = (args['img_raw_scale'], args['img_mean'], args['img_input_scale'])
    per_query = inshape[0]

    # One (query index, image paths, prep params) item per query.  Paths are
    # taken modulo the image count, so a short image list is reused cyclically.
    work = [
        (q, [images[(q * per_query + k) % len(images)]
             for k in range(per_query)], prep)
        for q in range(g_nQueries)
    ]

    t_begin = timeit.default_timer()
    dispatcher.run(work)
    # The reference is dropped before the clock is read, so anything triggered
    # by that release is part of the measured time.
    del dispatcher
    elapsed = timeit.default_timer() - t_begin

    qps = g_nQueries / elapsed
    fps = g_nQueries * inshape[0] / elapsed
    print("Queries: %d, Elapsed: %.2fs, QPS: %.2f, FPS: %.2f"
          % (g_nQueries, elapsed, qps, fps))

    # cleanup, in this order: workers, FPGA master, comms server
    del workers
    del fmaster
    del xserver
예제 #8
0
def main():
    """Run YOLO detection over all images through dispatcher/worker pools.

    Prints throughput once the dispatcher has finished and, when
    ``args['golden']`` is set, calls ``calc_detector_mAP`` afterwards.
    """
    parser = yolo_parser_args(xdnn_io.default_parser_args())
    args = xdnn_io.make_dict_args(parser.parse_args())

    num_dispatchers = args['numprepproc']
    num_workers = args['numworkers']

    # Setup the environment
    images = xdnn_io.getFilePaths(args['images'])
    if args['golden'] or args['visualize']:
        assert args['labels'], "Provide --labels to compute mAP."
        assert args['results_dir'], \
            "For accuracy measurements, provide --results_dir to save the detections."

    # start comms
    xserver = xstream.Server()

    # acquire resources
    fmaster = FpgaMaster(args['vitis_rundir'])

    # an explicit batch size overrides the leading input dimension
    inshape = list(fmaster.inshape)
    requested_batch = args['batch_sz']
    if requested_batch != -1:
        inshape[0] = requested_batch

    args['net_h'], args['net_w'] = inshape[2], inshape[3]

    # spawn dispatchers, then workers
    dispatcher = yoloDispatcher(num_dispatchers, num_workers, inshape)
    workers = yoloWorkerPool(args['vitis_rundir'] + "_worker", num_workers,
                             args)

    # Enough queries to cover every image once; the last query wraps around
    # to the start of the list when the count is not a multiple of the batch.
    per_query = inshape[0]
    num_queries = int(np.ceil(len(images) / per_query))
    prep = (args['img_raw_scale'], args['img_mean'], args['img_input_scale'])
    work = [
        (q, [images[(q * per_query + k) % len(images)]
             for k in range(per_query)], prep)
        for q in range(num_queries)
    ]

    t_begin = timeit.default_timer()
    dispatcher.run(work)
    # The reference is dropped before the clock is read, so anything triggered
    # by that release is part of the measured time.
    del dispatcher
    elapsed = timeit.default_timer() - t_begin

    qps = num_queries / elapsed
    fps = num_queries * inshape[0] / elapsed
    print("Queries: %d, Elapsed: %.2fs, QPS: %.2f, FPS: %.2f"
          % (num_queries, elapsed, qps, fps))
    sys.stdout.flush()

    # cleanup, in this order: workers, FPGA master, comms server
    del workers
    del fmaster
    del xserver

    # mAP calculation
    if not args['golden']:
        return
    print()
    print("Computing mAP score  : ")
    labels = xdnn_io.get_labels(args['labels'])
    print("Class names are  : {} ".format(labels))
    calc_detector_mAP(args['results_dir'], args['golden'],
                      len(labels), labels, args['prob_threshold'],
                      args['mapiouthresh'], args['points'])
    sys.stdout.flush()
예제 #9
0
def run(args=None):
  """Drive a three-stage pipeline: preprocess pool -> FPGA process -> post process.

  Args:
    args: Option mapping.  When falsy, options are parsed from the command
      line using the default xdnn parser plus the switches added below.
  """
  if not args:
    parser = xdnn_io.default_parser_args()
    int_switches = (
        ('--numprepproc', 1,
         'number of parallel processes used to decode and quantize images'),
        ('--numstream', 16,
         'number of FPGA streams'),
        ('--deviceID', 0,
         'FPGA no. -> FPGA ID to run in case multiple FPGAs'),
        ('--benchmarkmode', 0,
         'bypass pre/post processing for benchmarking'),
    )
    for flag, default, text in int_switches:
      parser.add_argument(flag, type=int, default=default, help=text)
    parser.add_argument('--profile', action='store_true',
                        help='Print average latencies for preproc/exec/postproc')

    args = xdnn_io.make_dict_args(parser.parse_args())

  sharedInputArrs = []  # NOTE(review): never read in this function
  fpgaOutputs = []

  compilerJSONObj = xdnn.CompilerJsonParser(args['netcfg'])

  input_shapes = [shape for _, shape in compilerJSONObj.getInputs().items()]
  output_shapes = [shape for _, shape in compilerJSONObj.getOutputs().items()]

  # Overwrite the leading entry of every shape with the requested batch size
  # (the shape objects are modified in place).
  for shape in input_shapes:
    shape[0] = args['batch_sz']
  for shape in output_shapes:
    shape[0] = args['batch_sz']

  # NOTE(review): these two values are never read in this function
  input_sizes = map(np.prod, input_shapes)
  output_sizes = map(np.prod, output_shapes)

  num_shared_slots = args['numstream']
  extra_shape = [(args['batch_sz'], 4)]

  # shared memory from preprocessing to fpga forward
  trans_slots = num_shared_slots * (args['numprepproc'] * args['batch_sz'])
  shared_trans_arrs = SharedMemoryQueue("trans", trans_slots,
                                        input_shapes + extra_shape)

  # shared memory from fpga forward to postprocessing
  shared_output_arrs = SharedMemoryQueue("output", num_shared_slots,
                                         output_shapes + extra_shape)

  # Image indices grouped into consecutive chunks of batch_sz
  img_paths = xdnn_io.getFilePaths(args['images'])
  imgids = list(range(len(img_paths)))
  step = args['batch_sz']
  imgid_chunks = [imgids[k:k + step] for k in range(0, len(img_paths), step)]

  # Start all processes
  p = mp.Pool(initializer=init_pre_process,
              initargs=(args, img_paths, input_shapes, shared_trans_arrs),
              processes=args['numprepproc'])

  xdnnProc = mp.Process(
      target=fpga_process,
      args=(args, len(imgid_chunks), compilerJSONObj, shared_trans_arrs,
            shared_output_arrs))
  postProc = mp.Process(
      target=post_process,
      args=(args, img_paths, fpgaOutputs, output_shapes, shared_output_arrs))
  xdnnProc.start()
  postProc.start()

  t1 = timeit.default_timer()
  if args['perpetual']:
    # Resubmit the whole image set forever, waiting for each pass to finish;
    # this loop has no exit, so the code after it is not reached.
    while True:
      pending = p.map_async(run_pre_process, imgid_chunks)
      pending.wait()
      del pending
  else:
    p.map_async(run_pre_process, imgid_chunks)

  xdnnProc.join()
  postProc.join()

  p.close()
  p.join()
  total_t = timeit.default_timer() - t1
  if args['profile']:
    print("Total time taken: {} s\n Total images: {}\nAverage FPS: {}".format(
        total_t, len(img_paths), len(img_paths) / total_t))
예제 #10
0
def run(args=None):
    """Build and run the "yolo_v2" prep -> fpga -> post graph over a set of images.

    Args:
        args: Option mapping.  When falsy, options are parsed from the command
            line and a default ``'preprocseq'`` entry is added.

    ``args['startxstream']`` is forced to True and ``args['servermode']`` to
    False regardless of what was supplied.
    """
    if not args:
        parser = yolo_parser_args(xdnn_io.default_parser_args())
        extra_switches = (
            ('--startxstream', dict(default=True, action='store_true',
                                    help='automatically start obj store server')),
            ('--servermode', dict(default=False, action='store_true',
                                  help='accept images from another process')),
            ("--deploymodel", dict(type=str, default='',
                                   help='Original prototxt')),
            ("--caffemodel", dict(type=str, default='',
                                  help='Original caffemodel')),
        )
        for flag, options in extra_switches:
            parser.add_argument(flag, **options)

        args = xdnn_io.make_dict_args(parser.parse_args())
        args['preprocseq'] = [('resize', (224, 224)),
                              ('meansub', [104.007, 116.669, 122.679]),
                              ('chtranspose', (2, 0, 1))]

    if args['golden'] or args['visualize']:
        assert args['labels'], "Provide --labels to compute mAP."
        assert args['results_dir'], \
            "For accuracy measurements, provide --results_dir to save the detections."
        labels = xdnn_io.get_labels(args['labels'])
        colors = generate_colors(len(labels))  # not read again in this function

    args['startxstream'] = True
    args['servermode'] = False

    # Queue from which the finish timestamp is read after the graph stops.
    timerQ = Queue()
    args['timerQ'] = timerQ

    compJson = xdnn.CompilerJsonParser(args['netcfg'])
    firstInputShape = next(itervalues(compJson.getInputs()))
    args['net_h'], args['net_w'] = firstInputShape[2], firstInputShape[3]

    # start object store
    # (make sure to 'pip install pyarrow')
    xserver = xstream.Server() if args['startxstream'] else None

    graph = grapher.Graph("yolo_v2")
    for name, node_cls in (("prep", yolov2_pre.Node),
                           ("fpga", yolov2_fpga.Node),
                           ("post", yolov2_post.Node)):
        graph.node(name, node_cls, args)

    for edge in (("START", None, "prep"),
                 ("prep", "prep", "fpga"),
                 ("fpga", "fpga", "post"),
                 ("DONE", "post", "fpga"),
                 ("DONE", "post", None)):
        graph.edge(*edge)

    if args['servermode']:
        print("Serving %s -> %s" % (graph._in[0], graph._out[0]))
        graph.serve()
    else:
        graph.serve(background=True)
        img_paths = xdnn_io.getFilePaths(args['images'])

        reqProc = mp.Process(target=request_process,
                             args=(args, img_paths, graph._in[0],
                                   graph._out[0]))

        t_begin = timeit.default_timer()
        reqProc.start()
        reqProc.join()
        graph.stop(kill=False)
        # the end timestamp is taken from the timer queue
        t_end = args['timerQ'].get()
        full_time = t_end - t_begin

        args['timerQ'].close()

        print("Total time : {}s for {} images".format(full_time,
                                                      len(img_paths)))
        print("Average FPS : {} imgs/sec".format(len(img_paths) / full_time))

    # mAP calculation
    if args['golden']:
        print(flush=True)
        print("Computing mAP score  : ", flush=True)
        print("Class names are  : {} ".format(labels), flush=True)
        calc_detector_mAP(args['results_dir'], args['golden'], len(labels),
                          labels, args['prob_threshold'],
                          args['mapiouthresh'], args['points'])
        sys.stdout.flush()
예제 #11
0
def main():
    """Run YOLO detection on every image through a ``Runner``, batch by batch.

    For each batch the images are loaded into the first input buffer, the
    runner is executed, and the outputs are post-processed into boxes.
    Detections are saved when ``args['results_dir']`` is set, latencies are
    printed when ``args['profile']`` is set, and ``calc_detector_mAP`` is
    called when ``args['golden']`` is set.

    Raises:
        ValueError: if ``args['yolo_version']`` is neither ``'v2'`` nor
            ``'v3'``.
    """
    parser = xdnn_io.default_parser_args()
    parser = yolo_parser_args(parser)
    args = parser.parse_args()
    args = xdnn_io.make_dict_args(args)

    # Setup the environment
    img_paths = xdnn_io.getFilePaths(args['images'])
    if (args['golden'] or args['visualize']):
        assert args['labels'], "Provide --labels to compute mAP."
        assert args[
            'results_dir'], "For accuracy measurements, provide --results_dir to save the detections."
        labels = xdnn_io.get_labels(args['labels'])
        colors = generate_colors(len(labels))

    # Pick the post-processing routine.  Reject anything else here rather
    # than failing on an unbound name after the first batch has executed.
    if args['yolo_version'] == 'v2':
        yolo_postproc = yolo.yolov2_postproc
    elif args['yolo_version'] == 'v3':
        yolo_postproc = yolo.yolov3_postproc
    else:
        raise ValueError("Unsupported yolo_version: {!r} (expected 'v2' or 'v3')"
                         .format(args['yolo_version']))

    runner = Runner(args['vitis_rundir'])

    # Setup the blobs
    inTensors = runner.get_input_tensors()
    outTensors = runner.get_output_tensors()
    batch_sz = args['batch_sz']
    if batch_sz == -1:
        # fall back to the batch dimension reported by the first input tensor
        batch_sz = inTensors[0].dims[0]

    # fpgaBlobs[0] holds the input buffers, fpgaBlobs[1] the output buffers;
    # each buffer uses the tensor's dims with the leading one set to batch_sz.
    fpgaBlobs = []
    for io in [inTensors, outTensors]:
        blobs = []
        for t in io:
            shape = (batch_sz, ) + tuple([t.dims[d]
                                          for d in range(t.ndims)][1:])
            blobs.append(np.empty((shape), dtype=np.float32, order='C'))
        fpgaBlobs.append(blobs)
    fpgaInput = fpgaBlobs[0][0]

    # Setup the YOLO config
    net_h, net_w = fpgaInput.shape[-2:]
    args['net_h'] = net_h
    args['net_w'] = net_w
    biases = bias_selector(args)

    # Setup profiling env (accumulated seconds per stage)
    prep_time = 0
    exec_time = 0
    post_time = 0

    # Start the execution
    for start in range(0, len(img_paths), batch_sz):
        pl = []
        img_shapes = []

        # Prep images
        t1 = timeit.default_timer()
        for j, p in enumerate(img_paths[start:start + batch_sz]):
            fpgaInput[j, ...], img_shape = xdnn_io.loadYoloImageBlobFromFile(
                p, net_h, net_w)
            pl.append(p)
            img_shapes.append(img_shape)
        t2 = timeit.default_timer()

        # Execute
        jid = runner.execute_async(fpgaBlobs[0], fpgaBlobs[1])
        runner.wait(jid)

        # Post Proc
        t3 = timeit.default_timer()
        boxes = yolo_postproc(fpgaBlobs[1], args, img_shapes, biases=biases)
        t4 = timeit.default_timer()

        prep_time += (t2 - t1)
        exec_time += (t3 - t2)
        post_time += (t4 - t3)

        # Number of images actually present in this batch (the final batch
        # may be short).
        n_valid = min(batch_sz, len(img_shapes))

        for k in range(n_valid):
            print("Detected {} boxes in {}".format(len(boxes[k]), pl[k]))

        # Save the result
        if (args['results_dir']):
            for k in range(n_valid):
                filename = os.path.splitext(os.path.basename(pl[k]))[0]
                out_file_txt = os.path.join(args['results_dir'],
                                            filename + '.txt')
                print("Saving {} boxes to {}".format(len(boxes[k]),
                                                     out_file_txt))
                sys.stdout.flush()
                saveDetectionDarknetStyle(out_file_txt, boxes[k],
                                          img_shapes[k])
                if (args['visualize']):
                    out_file_png = os.path.join(args['results_dir'],
                                                filename + '.png')
                    print("Saving result to {}".format(out_file_png))
                    sys.stdout.flush()
                    draw_boxes(pl[k], boxes[k], labels, colors, out_file_png)

    # Profiling results
    # NOTE(review): '{0:3f}' sets a minimum field width of 3, not three
    # decimal places - confirm whether '.3f' was intended.
    if (args['profile']):
        print("\nAverage Latency in ms:")
        print("  Image Prep: {0:3f}".format(prep_time * 1000.0 /
                                            len(img_paths)))
        print("  Exec: {0:3f}".format(exec_time * 1000.0 / len(img_paths)))
        print("  Post Proc: {0:3f}".format(post_time * 1000.0 /
                                           len(img_paths)))
        sys.stdout.flush()

    # mAP calculation
    if (args['golden']):
        print()
        print("Computing mAP score  : ")
        print("Class names are  : {} ".format(labels))
        calc_detector_mAP(args['results_dir'], args['golden'],
                          len(labels), labels, args['prob_threshold'],
                          args['mapiouthresh'], args['points'])
        sys.stdout.flush()
예제 #12
0
def yolo_gpu_inference(backend_path, image_dir, deploy_model, weights,
                       out_labels, IOU_threshold, scorethresh, mean_value,
                       pxscale, transpose, channel_swap, yolo_model,
                       num_classes, args):
    """Run YOLO detection on every image with a Caffe network, one image at a time.

    Only ``deploy_model``, ``weights`` and ``args`` are read by this
    function; the remaining parameters are accepted but not used here.

    Args:
        deploy_model: First argument passed to ``caffe.Net``.
        weights: Second argument passed to ``caffe.Net``.
        args: Option mapping; read for images, labels, results directory,
            YOLO version, GPU id and the golden/visualize switches.  The
            keys ``'net_h'`` and ``'net_w'`` are written into it.

    Returns:
        The number of images processed.

    Raises:
        ValueError: if ``args['yolo_version']`` is neither ``'v2'`` nor
            ``'v3'``.
    """
    # Setup the environment
    images = xdnn_io.getFilePaths(args['images'])
    if (args['golden'] or args['visualize']):
        assert args['labels'], "Provide --labels to compute mAP."
        assert args[
            'results_dir'], "For accuracy measurements, provide --results_dir to save the detections."
        labels = xdnn_io.get_labels(args['labels'])
        colors = generate_colors(len(labels))

    # Select postproc and biases.  Reject anything else here rather than
    # failing on an unbound name once the first image has been forwarded.
    if args['yolo_version'] == 'v2':
        yolo_postproc = yolo.yolov2_postproc
    elif args['yolo_version'] == 'v3':
        yolo_postproc = yolo.yolov3_postproc
    else:
        raise ValueError("Unsupported yolo_version: {!r} (expected 'v2' or 'v3')"
                         .format(args['yolo_version']))
    biases = bias_selector(args)

    import caffe
    # CPU mode first; switched to GPU mode below when a GPU id is given
    caffe.set_mode_cpu()
    print(args)
    if (args['gpu'] is not None):
        caffe.set_mode_gpu()
        caffe.set_device(args['gpu'])

    net = caffe.Net(deploy_model, weights, caffe.TEST)

    # Network input height/width are the last two dims of the 'data' blob
    net_h, net_w = net.blobs['data'].data.shape[-2:]
    args['net_h'] = net_h
    args['net_w'] = net_w

    for i, img in enumerate(images):
        if ((i + 1) % 100 == 0): print(i + 1, "images processed")
        raw_img, img_shape = xdnn_io.loadYoloImageBlobFromFile(
            img, net_h, net_w)

        net.blobs['data'].data[...] = raw_img
        out = net.forward()

        # Outputs are ordered by the size of their last dimension before
        # being handed to the post-processing routine.
        caffeOutput = sorted(out.values(), key=lambda item: item.shape[-1])
        boxes = yolo_postproc(caffeOutput, args, [img_shape], biases=biases)

        print("{}. Detected {} boxes in {}".format(i, len(boxes[0]), img))

        # Save the result (a single image was forwarded, so take entry 0)
        boxes = boxes[0]
        if (args['results_dir']):
            filename = os.path.splitext(os.path.basename(img))[0]
            out_file_txt = os.path.join(args['results_dir'], filename + '.txt')
            print("Saving {} boxes to {}".format(len(boxes), out_file_txt))
            sys.stdout.flush()
            saveDetectionDarknetStyle(out_file_txt, boxes, img_shape)
            if (args['visualize']):
                out_file_png = os.path.join(args['results_dir'],
                                            filename + '.png')
                print("Saving result to {}".format(out_file_png))
                sys.stdout.flush()
                draw_boxes(img, boxes, labels, colors, out_file_png)

    return len(images)
예제 #13
0
def run(args=None):
    """Build and run the "imagenet" prep -> fpga -> post graph.

    Args:
        args: Option mapping.  When falsy, options are parsed from the command
            line and a default ``'preprocseq'`` entry is added.

    Unless ``args['servermode']`` is set, the graph is served in the
    background, a request process is run to completion over the images, and
    the graph is stopped.  In server mode the graph is served in the
    foreground.
    """
    if not args:
        parser = xdnn_io.default_parser_args()
        int_switches = (
            ('--numprepproc', 1, '# parallel procs to decode/quantize images'),
            ('--numstream', 6, 'number of FPGA streams'),
            ('--deviceID', 0, 'FPGA no. -> FPGA ID to use multiple FPGAs'),
            ('--benchmarkmode', 0,
             'bypass pre/post processing for benchmarking'),
        )
        for flag, default, text in int_switches:
            parser.add_argument(flag, type=int, default=default, help=text)

        bool_switches = (
            ('--startxstream', 'automatically start obj store server'),
            ('--servermode', 'accept images from another process'),
        )
        for flag, text in bool_switches:
            parser.add_argument(flag, default=False, action='store_true',
                                help=text)

        args = xdnn_io.make_dict_args(parser.parse_args())
        args['preprocseq'] = [('resize', (224, 224)),
                              ('meansub', [104.007, 116.669, 122.679]),
                              ('chtranspose', (2, 0, 1))]

    # start object store
    # (make sure to 'pip install pyarrow')
    xserver = xstream.Server() if args['startxstream'] else None

    graph = grapher.Graph("imagenet")
    for name, node_cls in (("prep", pre.Node),
                           ("fpga", fpga.Node),
                           ("post", post.Node)):
        graph.node(name, node_cls, args)

    for edge in (("START", None, "prep"),
                 ("prep", "prep", "fpga"),
                 ("fpga", "fpga", "post"),
                 ("DONE", "post", "fpga"),
                 ("DONE", "post", None)):
        graph.edge(*edge)

    if args['servermode']:
        print("Serving %s -> %s" % (graph._in[0], graph._out[0]))
        graph.serve()
        return

    graph.serve(background=True)
    img_paths = xdnn_io.getFilePaths(args['images'])
    reqProc = mp.Process(target=request_process,
                         args=(args, img_paths, graph._in[0], graph._out[0]))
    reqProc.start()
    reqProc.join()
    graph.stop(kill=False)